Reproduction of Numbers quoted in OpenCL Programming Guides by AMD and NVIDIA
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

2478 lines
97 KiB

/*
* This file is part of clBandwidth.
*
* clBandwidth is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* clBandwidth is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with clBandwidth. If not, see <http://www.gnu.org/licenses/>.
*
* (c) 2011 Matthias Bach <bach@compeng.uni-frankfurt.de>
*/
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#define DOUBLE_ENABLED
#else /* cl_khr_fp64 */
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#define DOUBLE_ENABLED
#endif /* cl_amd_fp64 */
#endif /* cl_khr_fp64 */
/*
* float kernels
*/
__kernel void copyFloat(__global float * out, __global float * in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readFloat(__global float * out, __global float * in, const ulong elems)
{
float tmp = 0.0f;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp += in[i];
}
out[get_global_id(0)] = tmp;
}
__kernel void writeFloat(__global float * out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in;
}
}
__kernel void copyFloatRestricted(__global float * const restrict out, __global const float * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readFloatRestricted(__global float * const restrict out, __global const float * const restrict in, const ulong elems)
{
float tmp = 0.0f;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp += in[i];
}
out[get_global_id(0)] = tmp;
}
__kernel void writeFloatRestricted(__global float * const restrict out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in;
}
}
__kernel void copyFloat2(__global float2 * out, __global float2 * in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readFloat2(__global float2 * out, __global float2 * in, const ulong elems)
{
float2 tmp = 0.0f;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp += in[i];
}
out[get_global_id(0)] = tmp;
}
__kernel void writeFloat2(__global float2 * out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in;
}
}
__kernel void copyFloat4(__global float4 * out, __global float4 * in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readFloat4(__global float4 * out, __global float4 * in, const ulong elems)
{
float4 tmp = 0.0f;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp += in[i];
}
out[get_global_id(0)] = tmp;
}
__kernel void writeFloat4(__global float4 * out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in;
}
}
/*
* Single precision complex
*/
typedef struct { float re; float im; } spComplex;
spComplex make_spComplex(const float re, const float im) {
return (spComplex) {re, im};
}
spComplex spComplexAdd(const spComplex left, const spComplex right) {
return make_spComplex(left.re + right.re, left.im + right.im);
}
__kernel void copySpComplex(__global spComplex * out, __global spComplex * in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readSpComplex(__global spComplex * out, __global spComplex * in, const ulong elems)
{
spComplex tmp = make_spComplex(0.0f, 0.0f);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = spComplexAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeSpComplex(__global spComplex * out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = make_spComplex(in, in);
}
}
__kernel void copySpComplexRestricted(__global spComplex * const restrict out, __global const spComplex * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readSpComplexRestricted(__global spComplex * const restrict out, __global const spComplex * const restrict in, const ulong elems)
{
spComplex tmp = make_spComplex(0.0f, 0.0f);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = spComplexAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeSpComplexRestricted(__global spComplex * const restrict out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = make_spComplex(in, in);
}
}
typedef struct { float re; float im; } __attribute__ ((aligned (8))) alignedSpComplex;
alignedSpComplex make_alignedSpComplex(const float re, const float im) {
return (alignedSpComplex) {re, im};
}
alignedSpComplex alignedSpComplexAdd(const alignedSpComplex left, const alignedSpComplex right) {
return make_alignedSpComplex(left.re + right.re, left.im + right.im);
}
__kernel void copyAlignedSpComplex(__global alignedSpComplex * out, __global alignedSpComplex * in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readAlignedSpComplex(__global alignedSpComplex * out, __global alignedSpComplex * in, const ulong elems)
{
alignedSpComplex tmp = make_alignedSpComplex(0.0f, 0.0f);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = alignedSpComplexAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeAlignedSpComplex(__global alignedSpComplex * out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = make_alignedSpComplex(in, in);
}
}
__kernel void copyAlignedSpComplexRestricted(__global alignedSpComplex * const restrict out, __global const alignedSpComplex * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readAlignedSpComplexRestricted(__global alignedSpComplex * const restrict out, __global const alignedSpComplex * const restrict in, const ulong elems)
{
alignedSpComplex tmp = make_alignedSpComplex(0.0f, 0.0f);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = alignedSpComplexAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeAlignedSpComplexRestricted(__global alignedSpComplex * const restrict out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = make_alignedSpComplex(in, in);
}
}
/*
* Single precisoin SU3 vectors
*/
typedef struct {
spComplex e0;
spComplex e1;
spComplex e2;
} spSu3vec;
spSu3vec make_spSu3vec(const spComplex e0, const spComplex e1, const spComplex e2) {
return (spSu3vec) {e0, e1, e2};
}
spSu3vec spSu3vecAdd(const spSu3vec left, const spSu3vec right) {
return make_spSu3vec(
spComplexAdd(left.e0, right.e0),
spComplexAdd(left.e1, right.e1),
spComplexAdd(left.e2, right.e2)
);
}
__kernel void copySpSu3vec(__global spSu3vec * out, __global spSu3vec * in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readSpSu3vec(__global spSu3vec * out, __global spSu3vec * in, const ulong elems)
{
spComplex bla = make_spComplex(0.0f, 0.0f);
spSu3vec tmp = make_spSu3vec(bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = spSu3vecAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeSpSu3vec(__global spSu3vec * out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spComplex bla = make_spComplex(in, in);
out[i] = make_spSu3vec(bla, bla, bla);
}
}
__kernel void copySpSu3vecRestricted(__global spSu3vec * const restrict out, __global const spSu3vec * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readSpSu3vecRestricted(__global spSu3vec * const restrict out, __global const spSu3vec * const restrict in, const ulong elems)
{
spComplex bla = make_spComplex(0.0f, 0.0f);
spSu3vec tmp = make_spSu3vec(bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = spSu3vecAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeSpSu3vecRestricted(__global spSu3vec * const restrict out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spComplex bla = make_spComplex(in, in);
out[i] = make_spSu3vec(bla, bla, bla);
}
}
typedef struct {
spComplex e0;
spComplex e1;
spComplex e2;
} __attribute((aligned)) alignedSpSu3vec;
alignedSpSu3vec make_alignedSpSu3vec(const spComplex e0, const spComplex e1, const spComplex e2) {
return (alignedSpSu3vec) {e0, e1, e2};
}
alignedSpSu3vec alignedSpSu3vecAdd(const alignedSpSu3vec left, const alignedSpSu3vec right) {
return make_alignedSpSu3vec(
spComplexAdd(left.e0, right.e0),
spComplexAdd(left.e1, right.e1),
spComplexAdd(left.e2, right.e2)
);
}
__kernel void copyAlignedSpSu3vecRestricted(__global alignedSpSu3vec * const restrict out, __global const alignedSpSu3vec * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readAlignedSpSu3vecRestricted(__global alignedSpSu3vec * const restrict out, __global const alignedSpSu3vec * const restrict in, const ulong elems)
{
spComplex bla = make_spComplex(0.0f, 0.0f);
alignedSpSu3vec tmp = make_alignedSpSu3vec(bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = alignedSpSu3vecAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeAlignedSpSu3vecRestricted(__global alignedSpSu3vec * const restrict out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spComplex bla = make_spComplex(in, in);
out[i] = make_alignedSpSu3vec(bla, bla, bla);
}
}
typedef struct {
spComplex e0;
spComplex e1;
spComplex e2;
} __attribute((aligned(8))) aligned8SpSu3vec;
aligned8SpSu3vec make_aligned8SpSu3vec(const spComplex e0, const spComplex e1, const spComplex e2) {
return (aligned8SpSu3vec) {e0, e1, e2};
}
aligned8SpSu3vec aligned8SpSu3vecAdd(const aligned8SpSu3vec left, const aligned8SpSu3vec right) {
return make_aligned8SpSu3vec(
spComplexAdd(left.e0, right.e0),
spComplexAdd(left.e1, right.e1),
spComplexAdd(left.e2, right.e2)
);
}
__kernel void copyAligned8SpSu3vecRestricted(__global aligned8SpSu3vec * const restrict out, __global const aligned8SpSu3vec * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readAligned8SpSu3vecRestricted(__global aligned8SpSu3vec * const restrict out, __global const aligned8SpSu3vec * const restrict in, const ulong elems)
{
spComplex bla = make_spComplex(0.0f, 0.0f);
aligned8SpSu3vec tmp = make_aligned8SpSu3vec(bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = aligned8SpSu3vecAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeAligned8SpSu3vecRestricted(__global aligned8SpSu3vec * const restrict out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spComplex bla = make_spComplex(in, in);
out[i] = make_aligned8SpSu3vec(bla, bla, bla);
}
}
typedef struct {
spComplex e0;
spComplex e1;
spComplex e2;
} __attribute((aligned(16))) aligned16SpSu3vec;
aligned16SpSu3vec make_aligned16SpSu3vec(const spComplex e0, const spComplex e1, const spComplex e2) {
return (aligned16SpSu3vec) {e0, e1, e2};
}
aligned16SpSu3vec aligned16SpSu3vecAdd(const aligned16SpSu3vec left, const aligned16SpSu3vec right) {
return make_aligned16SpSu3vec(
spComplexAdd(left.e0, right.e0),
spComplexAdd(left.e1, right.e1),
spComplexAdd(left.e2, right.e2)
);
}
__kernel void copyAligned16SpSu3vecRestricted(__global aligned16SpSu3vec * const restrict out, __global const aligned16SpSu3vec * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readAligned16SpSu3vecRestricted(__global aligned16SpSu3vec * const restrict out, __global const aligned16SpSu3vec * const restrict in, const ulong elems)
{
spComplex bla = make_spComplex(0.0f, 0.0f);
aligned16SpSu3vec tmp = make_aligned16SpSu3vec(bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = aligned16SpSu3vecAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeAligned16SpSu3vecRestricted(__global aligned16SpSu3vec * const restrict out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spComplex bla = make_spComplex(in, in);
out[i] = make_aligned16SpSu3vec(bla, bla, bla);
}
}
typedef struct {
spComplex e0;
spComplex e1;
spComplex e2;
} __attribute((aligned(32))) aligned32SpSu3vec;
aligned32SpSu3vec make_aligned32SpSu3vec(const spComplex e0, const spComplex e1, const spComplex e2) {
return (aligned32SpSu3vec) {e0, e1, e2};
}
aligned32SpSu3vec aligned32SpSu3vecAdd(const aligned32SpSu3vec left, const aligned32SpSu3vec right) {
return make_aligned32SpSu3vec(
spComplexAdd(left.e0, right.e0),
spComplexAdd(left.e1, right.e1),
spComplexAdd(left.e2, right.e2)
);
}
__kernel void copyAligned32SpSu3vecRestricted(__global aligned32SpSu3vec * const restrict out, __global const aligned32SpSu3vec * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readAligned32SpSu3vecRestricted(__global aligned32SpSu3vec * const restrict out, __global const aligned32SpSu3vec * const restrict in, const ulong elems)
{
spComplex bla = make_spComplex(0.0f, 0.0f);
aligned32SpSu3vec tmp = make_aligned32SpSu3vec(bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = aligned32SpSu3vecAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeAligned32SpSu3vecRestricted(__global aligned32SpSu3vec * const restrict out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spComplex bla = make_spComplex(in, in);
out[i] = make_aligned32SpSu3vec(bla, bla, bla);
}
}
typedef struct {
alignedSpComplex e0;
alignedSpComplex e1;
alignedSpComplex e2;
} spSu3vecFromAligned;
spSu3vecFromAligned make_spSu3vecFromAligned(const alignedSpComplex e0, const alignedSpComplex e1, const alignedSpComplex e2) {
return (spSu3vecFromAligned) {e0, e1, e2};
}
spSu3vecFromAligned spSu3vecFromAlignedAdd(const spSu3vecFromAligned left, const spSu3vecFromAligned right) {
return make_spSu3vecFromAligned(
alignedSpComplexAdd(left.e0, right.e0),
alignedSpComplexAdd(left.e1, right.e1),
alignedSpComplexAdd(left.e2, right.e2)
);
}
__kernel void copySpSu3vecFromAlignedRestricted(__global spSu3vecFromAligned * const restrict out, __global const spSu3vecFromAligned * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readSpSu3vecFromAlignedRestricted(__global spSu3vecFromAligned * const restrict out, __global const spSu3vecFromAligned * const restrict in, const ulong elems)
{
alignedSpComplex bla = make_alignedSpComplex(0.0f, 0.0f);
spSu3vecFromAligned tmp = make_spSu3vecFromAligned(bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = spSu3vecFromAlignedAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeSpSu3vecFromAlignedRestricted(__global spSu3vecFromAligned * const restrict out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
alignedSpComplex bla = make_alignedSpComplex(in, in);
out[i] = make_spSu3vecFromAligned(bla, bla, bla);
}
}
/*
* Single precision SU3
*/
typedef struct {
spComplex e00, e01, e02;
spComplex e10, e11, e12;
spComplex e20, e21, e22;
} spSu3;
spSu3 make_spSu3(const spComplex e00, const spComplex e01, const spComplex e02,
const spComplex e10, const spComplex e11, const spComplex e12,
const spComplex e20, const spComplex e21, const spComplex e22) {
return (spSu3) {e00, e01, e02,
e10, e11, e12,
e20, e21, e22};
}
spSu3 spSu3Add(const spSu3 left, const spSu3 right) {
return make_spSu3(
spComplexAdd(left.e00, right.e00),
spComplexAdd(left.e01, right.e01),
spComplexAdd(left.e02, right.e02),
spComplexAdd(left.e10, right.e10),
spComplexAdd(left.e11, right.e11),
spComplexAdd(left.e12, right.e12),
spComplexAdd(left.e20, right.e20),
spComplexAdd(left.e21, right.e21),
spComplexAdd(left.e22, right.e22)
);
}
__kernel void copySpSu3(__global spSu3 * out, __global spSu3 * in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readSpSu3(__global spSu3 * out, __global spSu3 * in, const ulong elems)
{
spComplex bla = make_spComplex(0.0f, 0.0f);
spSu3 tmp = make_spSu3(bla, bla, bla, bla, bla, bla, bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = spSu3Add(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeSpSu3(__global spSu3 * out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spComplex bla = make_spComplex(in, in);
out[i] = make_spSu3(bla, bla, bla, bla, bla, bla, bla, bla, bla);
}
}
__kernel void copySpSu3Restricted(__global spSu3 * const restrict out, __global const spSu3 * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readSpSu3Restricted(__global spSu3 * const restrict out, __global const spSu3 * const restrict in, const ulong elems)
{
spComplex bla = make_spComplex(0.0f, 0.0f);
spSu3 tmp = make_spSu3(bla, bla, bla, bla, bla, bla, bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = spSu3Add(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeSpSu3Restricted(__global spSu3 * const restrict out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spComplex bla = make_spComplex(in, in);
out[i] = make_spSu3(bla, bla, bla, bla, bla, bla, bla, bla, bla);
}
}
spSu3 getSpSu3SOA(__global const spComplex * const restrict in, const size_t i, const size_t stride)
{
return make_spSu3(in[0 * stride + i], in[1 * stride + i], in[2 * stride + i],
in[3 * stride + i], in[4 * stride + i], in[5 * stride + i],
in[6 * stride + i], in[7 * stride + i], in[8 * stride + i]);
}
void putSpSu3SOA(__global spComplex * const restrict out, const size_t i, const spSu3 val, const size_t stride)
{
out[0 * stride + i] = val.e00;
out[1 * stride + i] = val.e01;
out[2 * stride + i] = val.e02;
out[3 * stride + i] = val.e10;
out[4 * stride + i] = val.e11;
out[5 * stride + i] = val.e12;
out[6 * stride + i] = val.e20;
out[7 * stride + i] = val.e21;
out[8 * stride + i] = val.e22;
}
__kernel void copySpSu3SOARestricted(__global spComplex * const restrict out, __global const spComplex * const restrict in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spSu3 tmp = getSpSu3SOA(in, i, stride);
putSpSu3SOA(out, i, tmp, stride);
}
}
__kernel void readSpSu3SOARestricted(__global spComplex * const restrict out, __global const spComplex * const restrict in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
spComplex bla = make_spComplex(0.0f, 0.0f);
spSu3 tmp = make_spSu3(bla, bla, bla, bla, bla, bla, bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = spSu3Add(tmp, getSpSu3SOA(in, i, stride));
}
putSpSu3SOA(out, get_global_id(0), tmp, stride);
}
__kernel void writeSpSu3SOARestricted(__global spComplex * const restrict out, const float in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spComplex bla = make_spComplex(in, in);
spSu3 tmp = make_spSu3(bla, bla, bla, bla, bla, bla, bla, bla, bla);
putSpSu3SOA(out, i, tmp, stride);
}
}
spSu3 getSpSu3ViaLocal(__global const spSu3 * const restrict in, const size_t block, __local spSu3 * const restrict scratch)
{
event_t event = async_work_group_copy((__local float2 *) scratch, (__global float2 *) &in[block * get_local_size(0)], get_local_size(0) * 9, 0);
wait_group_events(1, &event);
return scratch[get_local_id(0)];
}
void putSpSu3ViaLocal(__global spSu3 * const restrict out, const size_t block, const spSu3 val, __local spSu3 * const restrict scratch)
{
scratch[get_local_id(0)] = val;
barrier(CLK_LOCAL_MEM_FENCE);
event_t event = async_work_group_copy((__global float2 *) &out[block * get_local_size(0)], (__local float2 *) scratch, get_local_size(0) * 9, 0);
wait_group_events(1, &event);
}
__kernel void copySpSu3ViaLocalRestricted(__global spSu3 * const restrict out, __global const spSu3 * const restrict in, const ulong elems, __local spSu3 * const restrict scratch)
{
for(size_t i = get_group_id(0); i < elems / get_num_groups(0); i += get_num_groups(0)) {
spSu3 tmp = getSpSu3ViaLocal(in, i, scratch);
putSpSu3ViaLocal(out, i, tmp, scratch);
}
}
__kernel void readSpSu3ViaLocalRestricted(__global spSu3 * const restrict out, __global const spSu3 * const restrict in, const ulong elems, __local spSu3 * const restrict scratch)
{
spComplex bla = make_spComplex(0.0f, 0.0f);
spSu3 tmp = make_spSu3(bla, bla, bla, bla, bla, bla, bla, bla, bla);
for(size_t i = get_group_id(0); i < elems / get_num_groups(0); i += get_num_groups(0)) {
tmp = spSu3Add(tmp, getSpSu3ViaLocal(in, i, scratch));
}
putSpSu3ViaLocal(out, get_global_id(0), tmp, scratch);
}
__kernel void writeSpSu3ViaLocalRestricted(__global spSu3 * const restrict out, const float in, const ulong elems, __local spSu3 * const restrict scratch)
{
for(size_t i = get_group_id(0); i < elems / get_num_groups(0); i += get_num_groups(0)) {
spComplex bla = make_spComplex(in, in);
spSu3 tmp = make_spSu3(bla, bla, bla, bla, bla, bla, bla, bla, bla);
putSpSu3ViaLocal(out, i, tmp, scratch);
}
}
typedef struct {
alignedSpComplex e00, e01, e02;
alignedSpComplex e10, e11, e12;
alignedSpComplex e20, e21, e22;
} spSu3FromAligned;
spSu3FromAligned make_spSu3FromAligned(const alignedSpComplex e00, const alignedSpComplex e01, const alignedSpComplex e02,
const alignedSpComplex e10, const alignedSpComplex e11, const alignedSpComplex e12,
const alignedSpComplex e20, const alignedSpComplex e21, const alignedSpComplex e22) {
return (spSu3FromAligned) {e00, e01, e02,
e10, e11, e12,
e20, e21, e22};
}
spSu3FromAligned spSu3FromAlignedAdd(const spSu3FromAligned left, const spSu3FromAligned right) {
return make_spSu3FromAligned(
alignedSpComplexAdd(left.e00, right.e00),
alignedSpComplexAdd(left.e01, right.e01),
alignedSpComplexAdd(left.e02, right.e02),
alignedSpComplexAdd(left.e10, right.e10),
alignedSpComplexAdd(left.e11, right.e11),
alignedSpComplexAdd(left.e12, right.e12),
alignedSpComplexAdd(left.e20, right.e20),
alignedSpComplexAdd(left.e21, right.e21),
alignedSpComplexAdd(left.e22, right.e22)
);
}
__kernel void copySpSu3FromAlignedRestricted(__global spSu3FromAligned * const restrict out, __global const spSu3FromAligned * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readSpSu3FromAlignedRestricted(__global spSu3FromAligned * const restrict out, __global const spSu3FromAligned * const restrict in, const ulong elems)
{
alignedSpComplex bla = make_alignedSpComplex(0.0f, 0.0f);
spSu3FromAligned tmp = make_spSu3FromAligned(bla, bla, bla, bla, bla, bla, bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = spSu3FromAlignedAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeSpSu3FromAlignedRestricted(__global spSu3FromAligned * const restrict out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
alignedSpComplex bla = make_alignedSpComplex(in, in);
out[i] = make_spSu3FromAligned(bla, bla, bla, bla, bla, bla, bla, bla, bla);
}
}
spSu3FromAligned getSpSu3FromAlignedSOA(__global const alignedSpComplex * const restrict in, const size_t i, const size_t stride)
{
return make_spSu3FromAligned(in[0 * stride + i], in[1 * stride + i], in[2 * stride + i],
in[3 * stride + i], in[4 * stride + i], in[5 * stride + i],
in[6 * stride + i], in[7 * stride + i], in[8 * stride + i]);
}
void putSpSu3FromAlignedSOA(__global alignedSpComplex * const restrict out, const size_t i, const spSu3FromAligned val, const size_t stride)
{
out[0 * stride + i] = val.e00;
out[1 * stride + i] = val.e01;
out[2 * stride + i] = val.e02;
out[3 * stride + i] = val.e10;
out[4 * stride + i] = val.e11;
out[5 * stride + i] = val.e12;
out[6 * stride + i] = val.e20;
out[7 * stride + i] = val.e21;
out[8 * stride + i] = val.e22;
}
__kernel void copySpSu3FromAlignedSOARestricted(__global alignedSpComplex * const restrict out, __global const alignedSpComplex * const restrict in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spSu3FromAligned tmp = getSpSu3FromAlignedSOA(in, i, stride);
putSpSu3FromAlignedSOA(out, i, tmp, stride);
}
}
__kernel void readSpSu3FromAlignedSOARestricted(__global alignedSpComplex * const restrict out, __global const alignedSpComplex * const restrict in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
alignedSpComplex bla = make_alignedSpComplex(0.0f, 0.0f);
spSu3FromAligned tmp = make_spSu3FromAligned(bla, bla, bla, bla, bla, bla, bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = spSu3FromAlignedAdd(tmp, getSpSu3FromAlignedSOA(in, i, stride));
}
putSpSu3FromAlignedSOA(out, get_global_id(0), tmp, stride);
}
__kernel void writeSpSu3FromAlignedSOARestricted(__global alignedSpComplex * const restrict out, const float in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
alignedSpComplex bla = make_alignedSpComplex(in, in);
spSu3FromAligned tmp = make_spSu3FromAligned(bla, bla, bla, bla, bla, bla, bla, bla, bla);
putSpSu3FromAlignedSOA(out, i, tmp, stride);
}
}
spSu3FromAligned getSpSu3FromAlignedViaLocal(__global const spSu3FromAligned * const restrict in, const size_t block, __local spSu3FromAligned * const restrict scratch)
{
event_t event = async_work_group_copy((__local float2 *) scratch, (__global float2 *) &in[block * get_local_size(0)], get_local_size(0) * 9, 0);
wait_group_events(1, &event);
return scratch[get_local_id(0)];
}
void putSpSu3FromAlignedViaLocal(__global spSu3FromAligned * const restrict out, const size_t block, const spSu3FromAligned val, __local spSu3FromAligned * const restrict scratch)
{
scratch[get_local_id(0)] = val;
barrier(CLK_LOCAL_MEM_FENCE);
event_t event = async_work_group_copy((__global float2 *) &out[block * get_local_size(0)], (__local float2 *) scratch, get_local_size(0) * 9, 0);
wait_group_events(1, &event);
}
__kernel void copySpSu3FromAlignedViaLocalRestricted(__global spSu3FromAligned * const restrict out, __global const spSu3FromAligned * const restrict in, const ulong elems, __local spSu3FromAligned * const restrict scratch)
{
for(size_t i = get_group_id(0); i < elems / get_num_groups(0); i += get_num_groups(0)) {
spSu3FromAligned tmp = getSpSu3FromAlignedViaLocal(in, i, scratch);
putSpSu3FromAlignedViaLocal(out, i, tmp, scratch);
}
}
__kernel void readSpSu3FromAlignedViaLocalRestricted(__global spSu3FromAligned * const restrict out, __global const spSu3FromAligned * const restrict in, const ulong elems, __local spSu3FromAligned * const restrict scratch)
{
alignedSpComplex bla = make_alignedSpComplex(0.0f, 0.0f);
spSu3FromAligned tmp = make_spSu3FromAligned(bla, bla, bla, bla, bla, bla, bla, bla, bla);
for(size_t i = get_group_id(0); i < elems / get_num_groups(0); i += get_num_groups(0)) {
tmp = spSu3FromAlignedAdd(tmp, getSpSu3FromAlignedViaLocal(in, i, scratch));
}
putSpSu3FromAlignedViaLocal(out, get_global_id(0), tmp, scratch);
}
__kernel void writeSpSu3FromAlignedViaLocalRestricted(__global spSu3FromAligned * const restrict out, const float in, const ulong elems, __local spSu3FromAligned * const restrict scratch)
{
for(size_t i = get_group_id(0); i < elems / get_num_groups(0); i += get_num_groups(0)) {
alignedSpComplex bla = make_alignedSpComplex(in, in);
spSu3FromAligned tmp = make_spSu3FromAligned(bla, bla, bla, bla, bla, bla, bla, bla, bla);
putSpSu3FromAlignedViaLocal(out, i, tmp, scratch);
}
}
typedef struct {
spComplex e00, e01, e02;
spComplex e10, e11, e12;
spComplex e20, e21, e22;
} __attribute__((aligned(8))) alignedSpSu3;
alignedSpSu3 make_alignedSpSu3(const spComplex e00, const spComplex e01, const spComplex e02,
const spComplex e10, const spComplex e11, const spComplex e12,
const spComplex e20, const spComplex e21, const spComplex e22) {
return (alignedSpSu3) {e00, e01, e02,
e10, e11, e12,
e20, e21, e22};
}
alignedSpSu3 alignedSpSu3Add(const alignedSpSu3 left, const alignedSpSu3 right) {
return make_alignedSpSu3(
spComplexAdd(left.e00, right.e00),
spComplexAdd(left.e01, right.e01),
spComplexAdd(left.e02, right.e02),
spComplexAdd(left.e10, right.e10),
spComplexAdd(left.e11, right.e11),
spComplexAdd(left.e12, right.e12),
spComplexAdd(left.e20, right.e20),
spComplexAdd(left.e21, right.e21),
spComplexAdd(left.e22, right.e22)
);
}
__kernel void copyAligned8SpSu3Restricted(__global alignedSpSu3 * const restrict out, __global const alignedSpSu3 * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readAligned8SpSu3Restricted(__global alignedSpSu3 * const restrict out, __global const alignedSpSu3 * const restrict in, const ulong elems)
{
spComplex bla = make_spComplex(0.0f, 0.0f);
alignedSpSu3 tmp = make_alignedSpSu3(bla, bla, bla, bla, bla, bla, bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = alignedSpSu3Add(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeAligned8SpSu3Restricted(__global alignedSpSu3 * const restrict out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spComplex bla = make_spComplex(in, in);
out[i] = make_alignedSpSu3(bla, bla, bla, bla, bla, bla, bla, bla, bla);
}
}
/*
* Single precision spinors
*/
typedef struct {
spSu3vec e0;
spSu3vec e1;
spSu3vec e2;
spSu3vec e3;
} spSpinor;
spSpinor make_spSpinor(const spSu3vec e0, const spSu3vec e1, const spSu3vec e2, const spSu3vec e3) {
return (spSpinor) {e0, e1, e2, e3};
}
spSpinor spSpinorAdd(const spSpinor left, const spSpinor right) {
return make_spSpinor(
spSu3vecAdd(left.e0, right.e0),
spSu3vecAdd(left.e1, right.e1),
spSu3vecAdd(left.e2, right.e2),
spSu3vecAdd(left.e3, right.e3)
);
}
__kernel void copySpSpinor(__global spSpinor * out, __global spSpinor * in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readSpSpinor(__global spSpinor * out, __global spSpinor * in, const ulong elems)
{
spComplex bla = make_spComplex(0.0f, 0.0f);
spSu3vec foo = make_spSu3vec(bla, bla, bla);
spSpinor tmp = make_spSpinor(foo, foo, foo, foo);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = spSpinorAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeSpSpinor(__global spSpinor * out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spComplex bla = make_spComplex(in, in);
spSu3vec foo = make_spSu3vec(bla, bla, bla);
out[i] = make_spSpinor(foo, foo, foo, foo);
}
}
__kernel void copySpSpinorRestricted(__global spSpinor * const restrict out, __global const spSpinor * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readSpSpinorRestricted(__global spSpinor * const restrict out, __global const spSpinor * const restrict in, const ulong elems)
{
spComplex bla = make_spComplex(0.0f, 0.0f);
spSu3vec foo = make_spSu3vec(bla, bla, bla);
spSpinor tmp = make_spSpinor(foo, foo, foo, foo);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = spSpinorAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeSpSpinorRestricted(__global spSpinor * const restrict out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spComplex bla = make_spComplex(in, in);
spSu3vec foo = make_spSu3vec(bla, bla, bla);
out[i] = make_spSpinor(foo, foo, foo, foo);
}
}
spSpinor getSpSpinorSOA(__global const spSu3vec * const restrict in, const size_t i, const size_t stride)
{
return make_spSpinor(in[0 * stride + i], in[1 * stride + i], in[2 * stride + i], in[3 * stride + i]);
}
void putSpSpinorSOA(__global spSu3vec * const restrict out, const size_t i, const spSpinor val, const size_t stride)
{
out[0 * stride + i] = val.e0;
out[1 * stride + i] = val.e1;
out[2 * stride + i] = val.e2;
out[3 * stride + i] = val.e3;
}
__kernel void copySpSpinorSOARestricted(__global spSu3vec * const restrict out, __global const spSu3vec * const restrict in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spSpinor tmp = getSpSpinorSOA(in, i, stride);
putSpSpinorSOA(out, i, tmp, stride);
}
}
__kernel void readSpSpinorSOARestricted(__global spSu3vec * const restrict out, __global const spSu3vec * const restrict in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
spComplex bla = make_spComplex(0.0f, 0.0f);
spSu3vec foo = make_spSu3vec(bla, bla, bla);
spSpinor tmp = make_spSpinor(foo, foo, foo, foo);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = spSpinorAdd(tmp, getSpSpinorSOA(in, i, stride));
}
putSpSpinorSOA(out, get_global_id(0), tmp, stride);
}
__kernel void writeSpSpinorSOARestricted(__global spSu3vec * const restrict out, const float in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spComplex bla = make_spComplex(in, in);
spSu3vec foo = make_spSu3vec(bla, bla, bla);
spSpinor tmp = make_spSpinor(foo, foo, foo, foo);
putSpSpinorSOA(out, i, tmp, stride);
}
}
spSpinor getSpSpinorFullSOA(__global const spComplex * const restrict in, const size_t i, const size_t stride)
{
return make_spSpinor(make_spSu3vec(in[0 * stride + i], in[ 1 * stride + i], in[ 2 * stride + i]),
make_spSu3vec(in[3 * stride + i], in[ 4 * stride + i], in[ 5 * stride + i]),
make_spSu3vec(in[6 * stride + i], in[ 7 * stride + i], in[ 8 * stride + i]),
make_spSu3vec(in[9 * stride + i], in[10 * stride + i], in[11 * stride + i]));
}
void putSpSpinorFullSOA(__global spComplex * const restrict out, const size_t i, const spSpinor val, const size_t stride)
{
out[ 0 * stride + i] = val.e0.e0;
out[ 1 * stride + i] = val.e0.e1;
out[ 2 * stride + i] = val.e0.e2;
out[ 3 * stride + i] = val.e1.e0;
out[ 4 * stride + i] = val.e1.e1;
out[ 5 * stride + i] = val.e1.e2;
out[ 6 * stride + i] = val.e2.e0;
out[ 7 * stride + i] = val.e2.e1;
out[ 8 * stride + i] = val.e2.e2;
out[ 9 * stride + i] = val.e3.e0;
out[10 * stride + i] = val.e3.e1;
out[11 * stride + i] = val.e3.e2;
}
__kernel void copySpSpinorFullSOARestricted(__global spComplex * const restrict out, __global const spComplex * const restrict in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spSpinor tmp = getSpSpinorFullSOA(in, i, stride);
putSpSpinorFullSOA(out, i, tmp, stride);
}
}
__kernel void readSpSpinorFullSOARestricted(__global spComplex * const restrict out, __global const spComplex * const restrict in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
spComplex bla = make_spComplex(0.0f, 0.0f);
spSu3vec foo = make_spSu3vec(bla, bla, bla);
spSpinor tmp = make_spSpinor(foo, foo, foo, foo);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = spSpinorAdd(tmp, getSpSpinorFullSOA(in, i, stride));
}
putSpSpinorFullSOA(out, get_global_id(0), tmp, stride);
}
__kernel void writeSpSpinorFullSOARestricted(__global spComplex * const restrict out, const float in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spComplex bla = make_spComplex(in, in);
spSu3vec foo = make_spSu3vec(bla, bla, bla);
spSpinor tmp = make_spSpinor(foo, foo, foo, foo);
putSpSpinorFullSOA(out, i, tmp, stride);
}
}
typedef struct {
spSu3vecFromAligned e0;
spSu3vecFromAligned e1;
spSu3vecFromAligned e2;
spSu3vecFromAligned e3;
} spSpinorFromFromAligned;
spSpinorFromFromAligned make_spSpinorFromFromAligned(const spSu3vecFromAligned e0, const spSu3vecFromAligned e1, const spSu3vecFromAligned e2, const spSu3vecFromAligned e3)
{
return (spSpinorFromFromAligned) {e0, e1, e2, e3};
}
spSpinorFromFromAligned spSpinorFromFromAlignedAdd(const spSpinorFromFromAligned left, const spSpinorFromFromAligned right)
{
return make_spSpinorFromFromAligned(spSu3vecFromAlignedAdd(left.e0, right.e0),
spSu3vecFromAlignedAdd(left.e1, right.e1),
spSu3vecFromAlignedAdd(left.e2, right.e2),
spSu3vecFromAlignedAdd(left.e3, right.e3));
}
spSpinorFromFromAligned getSpSpinorFullAlignedSOA(__global const alignedSpComplex * const restrict in, const size_t i, const size_t stride)
{
return make_spSpinorFromFromAligned(make_spSu3vecFromAligned(in[0 * stride + i], in[ 1 * stride + i], in[ 2 * stride + i]),
make_spSu3vecFromAligned(in[3 * stride + i], in[ 4 * stride + i], in[ 5 * stride + i]),
make_spSu3vecFromAligned(in[6 * stride + i], in[ 7 * stride + i], in[ 8 * stride + i]),
make_spSu3vecFromAligned(in[9 * stride + i], in[10 * stride + i], in[11 * stride + i]));
}
void putSpSpinorFullAlignedSOA(__global alignedSpComplex * const restrict out, const size_t i, const spSpinorFromFromAligned val, const size_t stride)
{
out[ 0 * stride + i] = val.e0.e0;
out[ 1 * stride + i] = val.e0.e1;
out[ 2 * stride + i] = val.e0.e2;
out[ 3 * stride + i] = val.e1.e0;
out[ 4 * stride + i] = val.e1.e1;
out[ 5 * stride + i] = val.e1.e2;
out[ 6 * stride + i] = val.e2.e0;
out[ 7 * stride + i] = val.e2.e1;
out[ 8 * stride + i] = val.e2.e2;
out[ 9 * stride + i] = val.e3.e0;
out[10 * stride + i] = val.e3.e1;
out[11 * stride + i] = val.e3.e2;
}
__kernel void copySpSpinorFullAlignedSOARestricted(__global alignedSpComplex * const restrict out, __global const alignedSpComplex * const restrict in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spSpinorFromFromAligned tmp = getSpSpinorFullAlignedSOA(in, i, stride);
putSpSpinorFullAlignedSOA(out, i, tmp, stride);
}
}
__kernel void readSpSpinorFullAlignedSOARestricted(__global alignedSpComplex * const restrict out, __global const alignedSpComplex * const restrict in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
alignedSpComplex bla = make_alignedSpComplex(0.0f, 0.0f);
spSu3vecFromAligned foo = make_spSu3vecFromAligned(bla, bla, bla);
spSpinorFromFromAligned tmp = make_spSpinorFromFromAligned(foo, foo, foo, foo);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = spSpinorFromFromAlignedAdd(tmp, getSpSpinorFullAlignedSOA(in, i, stride));
}
putSpSpinorFullAlignedSOA(out, get_global_id(0), tmp, stride);
}
__kernel void writeSpSpinorFullAlignedSOARestricted(__global alignedSpComplex * const restrict out, const float in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
alignedSpComplex bla = make_alignedSpComplex(in, in);
spSu3vecFromAligned foo = make_spSu3vecFromAligned(bla, bla, bla);
spSpinorFromFromAligned tmp = make_spSpinorFromFromAligned(foo, foo, foo, foo);
putSpSpinorFullAlignedSOA(out, i, tmp, stride);
}
}
spSpinor getSpSpinorViaLocal(__global const spSpinor * const restrict in, const size_t block, __local spSpinor * const restrict scratch)
{
event_t event = async_work_group_copy((__local float2 *) scratch, (__global float2 *) &in[block * get_local_size(0)], get_local_size(0) * 12, 0);
wait_group_events(1, &event);
return scratch[get_local_id(0)];
}
void putSpSpinorViaLocal(__global spSpinor * const restrict out, const size_t block, const spSpinor val, __local spSpinor * const restrict scratch)
{
scratch[get_local_id(0)] = val;
barrier(CLK_LOCAL_MEM_FENCE);
event_t event = async_work_group_copy((__global float2 *) &out[block * get_local_size(0)], (__local float2 *) scratch, get_local_size(0) * 12, 0);
wait_group_events(1, &event);
}
__kernel void copySpSpinorViaLocalRestricted(__global spSpinor * const restrict out, __global const spSpinor * const restrict in, const ulong elems, __local spSpinor * const restrict scratch)
{
for(size_t i = get_group_id(0); i < elems / get_num_groups(0); i += get_num_groups(0)) {
spSpinor tmp = getSpSpinorViaLocal(in, i, scratch);
putSpSpinorViaLocal(out, i, tmp, scratch);
}
}
__kernel void readSpSpinorViaLocalRestricted(__global spSpinor * const restrict out, __global const spSpinor * const restrict in, const ulong elems, __local spSpinor * const restrict scratch)
{
spComplex bla = make_spComplex(0.0f, 0.0f);
spSu3vec foo = make_spSu3vec(bla, bla, bla);
spSpinor tmp = make_spSpinor(foo, foo, foo, foo);
for(size_t i = get_group_id(0); i < elems / get_num_groups(0); i += get_num_groups(0)) {
tmp = spSpinorAdd(tmp, getSpSpinorViaLocal(in, i, scratch));
}
putSpSpinorViaLocal(out, get_global_id(0), tmp, scratch);
}
__kernel void writeSpSpinorViaLocalRestricted(__global spSpinor * const restrict out, const float in, const ulong elems, __local spSpinor * const restrict scratch)
{
for(size_t i = get_group_id(0); i < elems / get_num_groups(0); i += get_num_groups(0)) {
spComplex bla = make_spComplex(in, in);
spSu3vec foo = make_spSu3vec(bla, bla, bla);
spSpinor tmp = make_spSpinor(foo, foo, foo, foo);
putSpSpinorViaLocal(out, i, tmp, scratch);
}
}
typedef struct {
aligned8SpSu3vec e0;
aligned8SpSu3vec e1;
aligned8SpSu3vec e2;
aligned8SpSu3vec e3;
} spSpinorFromAligned;
spSpinorFromAligned make_spSpinorFromAligned(const aligned8SpSu3vec e0, const aligned8SpSu3vec e1, const aligned8SpSu3vec e2, const aligned8SpSu3vec e3)
{
return (spSpinorFromAligned) {e0, e1, e2, e3};
}
spSpinorFromAligned spSpinorFromAlignedAdd(const spSpinorFromAligned left, const spSpinorFromAligned right) {
return make_spSpinorFromAligned(
aligned8SpSu3vecAdd(left.e0, right.e0),
aligned8SpSu3vecAdd(left.e1, right.e1),
aligned8SpSu3vecAdd(left.e2, right.e2),
aligned8SpSu3vecAdd(left.e3, right.e3)
);
}
__kernel void copySpSpinorFromAlignedRestricted(__global spSpinorFromAligned * const restrict out, __global const spSpinorFromAligned * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readSpSpinorFromAlignedRestricted(__global spSpinorFromAligned * const restrict out, __global const spSpinorFromAligned * const restrict in, const ulong elems)
{
spComplex bla = make_spComplex(0.0f, 0.0f);
aligned8SpSu3vec foo = make_aligned8SpSu3vec(bla, bla, bla);
spSpinorFromAligned tmp = make_spSpinorFromAligned(foo, foo, foo, foo);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = spSpinorFromAlignedAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeSpSpinorFromAlignedRestricted(__global spSpinorFromAligned * const restrict out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spComplex bla = make_spComplex(in, in);
aligned8SpSu3vec foo = make_aligned8SpSu3vec(bla, bla, bla);
out[i] = make_spSpinorFromAligned(foo, foo, foo, foo);
}
}
spSpinorFromAligned getSpSpinorFromAlignedSOA(__global const aligned8SpSu3vec * const restrict in, const size_t i, const size_t stride)
{
return make_spSpinorFromAligned(in[0 * stride + i], in[1 * stride + i], in[2 * stride + i], in[3 * stride + i]);
}
void putSpSpinorFromAlignedSOA(__global aligned8SpSu3vec * const restrict out, const size_t i, const spSpinorFromAligned val, const size_t stride)
{
out[0 * stride + i] = val.e0;
out[1 * stride + i] = val.e1;
out[2 * stride + i] = val.e2;
out[3 * stride + i] = val.e3;
}
__kernel void copySpSpinorFromAlignedSOARestricted(__global aligned8SpSu3vec * const restrict out, __global const aligned8SpSu3vec * const restrict in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spSpinorFromAligned tmp = getSpSpinorFromAlignedSOA(in, i, stride);
putSpSpinorFromAlignedSOA(out, i, tmp, stride);
}
}
__kernel void readSpSpinorFromAlignedSOARestricted(__global aligned8SpSu3vec * const restrict out, __global const aligned8SpSu3vec * const restrict in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
spComplex bla = make_spComplex(0.0f, 0.0f);
aligned8SpSu3vec foo = make_aligned8SpSu3vec(bla, bla, bla);
spSpinorFromAligned tmp = make_spSpinorFromAligned(foo, foo, foo, foo);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = spSpinorFromAlignedAdd(tmp, getSpSpinorFromAlignedSOA(in, i, stride));
}
putSpSpinorFromAlignedSOA(out, get_global_id(0), tmp, stride);
}
__kernel void writeSpSpinorFromAlignedSOARestricted(__global aligned8SpSu3vec * const restrict out, const float in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spComplex bla = make_spComplex(in, in);
aligned8SpSu3vec foo = make_aligned8SpSu3vec(bla, bla, bla);
spSpinorFromAligned tmp = make_spSpinorFromAligned(foo, foo, foo, foo);
putSpSpinorFromAlignedSOA(out, i, tmp, stride);
}
}
spSpinorFromAligned getSpSpinorFromAlignedViaLocal(__global const spSpinorFromAligned * const restrict in, const size_t block, __local spSpinorFromAligned * const restrict scratch)
{
event_t event = async_work_group_copy((__local float2 *) scratch, (__global float2 *) &in[block * get_local_size(0)], get_local_size(0) * 12, 0);
wait_group_events(1, &event);
return scratch[get_local_id(0)];
}
void putSpSpinorFromAlignedViaLocal(__global spSpinorFromAligned * const restrict out, const size_t block, const spSpinorFromAligned val, __local spSpinorFromAligned * const restrict scratch)
{
scratch[get_local_id(0)] = val;
barrier(CLK_LOCAL_MEM_FENCE);
event_t event = async_work_group_copy((__global float2 *) &out[block * get_local_size(0)], (__local float2 *) scratch, get_local_size(0) * 12, 0);
wait_group_events(1, &event);
}
__kernel void copySpSpinorFromAlignedViaLocalRestricted(__global spSpinorFromAligned * const restrict out, __global const spSpinorFromAligned * const restrict in, const ulong elems, __local spSpinorFromAligned * const restrict scratch)
{
for(size_t i = get_group_id(0); i < elems / get_num_groups(0); i += get_num_groups(0)) {
spSpinorFromAligned tmp = getSpSpinorFromAlignedViaLocal(in, i, scratch);
putSpSpinorFromAlignedViaLocal(out, i, tmp, scratch);
}
}
__kernel void readSpSpinorFromAlignedViaLocalRestricted(__global spSpinorFromAligned * const restrict out, __global const spSpinorFromAligned * const restrict in, const ulong elems, __local spSpinorFromAligned * const restrict scratch)
{
spComplex bla = make_spComplex(0.0f, 0.0f);
aligned8SpSu3vec foo = make_aligned8SpSu3vec(bla, bla, bla);
spSpinorFromAligned tmp = make_spSpinorFromAligned(foo, foo, foo, foo);
for(size_t i = get_group_id(0); i < elems / get_num_groups(0); i += get_num_groups(0)) {
tmp = spSpinorFromAlignedAdd(tmp, getSpSpinorFromAlignedViaLocal(in, i, scratch));
}
putSpSpinorFromAlignedViaLocal(out, get_global_id(0), tmp, scratch);
}
__kernel void writeSpSpinorFromAlignedViaLocalRestricted(__global spSpinorFromAligned * const restrict out, const float in, const ulong elems, __local spSpinorFromAligned * const restrict scratch)
{
for(size_t i = get_group_id(0); i < elems / get_num_groups(0); i += get_num_groups(0)) {
spComplex bla = make_spComplex(in, in);
aligned8SpSu3vec foo = make_aligned8SpSu3vec(bla, bla, bla);
spSpinorFromAligned tmp = make_spSpinorFromAligned(foo, foo, foo, foo);
putSpSpinorFromAlignedViaLocal(out, i, tmp, scratch);
}
}
typedef struct {
aligned8SpSu3vec e0;
aligned8SpSu3vec e1;
aligned8SpSu3vec e2;
aligned8SpSu3vec e3;
} __attribute__((aligned(8))) aligned8SpSpinor;
aligned8SpSpinor make_aligned8SpSpinor(const aligned8SpSu3vec e0, const aligned8SpSu3vec e1, const aligned8SpSu3vec e2, const aligned8SpSu3vec e3) {
return (aligned8SpSpinor) {e0, e1, e2, e3};
}
aligned8SpSpinor aligned8SpSpinorAdd(const aligned8SpSpinor left, const aligned8SpSpinor right) {
return make_aligned8SpSpinor(
aligned8SpSu3vecAdd(left.e0, right.e0),
aligned8SpSu3vecAdd(left.e1, right.e1),
aligned8SpSu3vecAdd(left.e2, right.e2),
aligned8SpSu3vecAdd(left.e3, right.e3)
);
}
__kernel void copyAligned8SpSpinorRestricted(__global aligned8SpSpinor * const restrict out, __global const aligned8SpSpinor * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readAligned8SpSpinorRestricted(__global aligned8SpSpinor * const restrict out, __global const aligned8SpSpinor * const restrict in, const ulong elems)
{
spComplex bla = make_spComplex(0.0f, 0.0f);
aligned8SpSu3vec foo = make_aligned8SpSu3vec(bla, bla, bla);
aligned8SpSpinor tmp = make_aligned8SpSpinor(foo, foo, foo, foo);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = aligned8SpSpinorAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeAligned8SpSpinorRestricted(__global aligned8SpSpinor * const restrict out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spComplex bla = make_spComplex(in, in);
aligned8SpSu3vec foo = make_aligned8SpSu3vec(bla, bla, bla);
out[i] = make_aligned8SpSpinor(foo, foo, foo, foo);
}
}
typedef struct {
aligned8SpSu3vec e0;
aligned8SpSu3vec e1;
aligned8SpSu3vec e2;
aligned8SpSu3vec e3;
} __attribute__((aligned(16))) aligned16SpSpinor;
aligned16SpSpinor make_aligned16SpSpinor(const aligned8SpSu3vec e0, const aligned8SpSu3vec e1, const aligned8SpSu3vec e2, const aligned8SpSu3vec e3) {
return (aligned16SpSpinor) {e0, e1, e2, e3};
}
aligned16SpSpinor aligned16SpSpinorAdd(const aligned16SpSpinor left, const aligned16SpSpinor right) {
return make_aligned16SpSpinor(
aligned8SpSu3vecAdd(left.e0, right.e0),
aligned8SpSu3vecAdd(left.e1, right.e1),
aligned8SpSu3vecAdd(left.e2, right.e2),
aligned8SpSu3vecAdd(left.e3, right.e3)
);
}
__kernel void copyAligned16SpSpinorRestricted(__global aligned16SpSpinor * const restrict out, __global const aligned16SpSpinor * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readAligned16SpSpinorRestricted(__global aligned16SpSpinor * const restrict out, __global const aligned16SpSpinor * const restrict in, const ulong elems)
{
spComplex bla = make_spComplex(0.0f, 0.0f);
aligned8SpSu3vec foo = make_aligned8SpSu3vec(bla, bla, bla);
aligned16SpSpinor tmp = make_aligned16SpSpinor(foo, foo, foo, foo);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = aligned16SpSpinorAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeAligned16SpSpinorRestricted(__global aligned16SpSpinor * const restrict out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spComplex bla = make_spComplex(in, in);
aligned8SpSu3vec foo = make_aligned8SpSu3vec(bla, bla, bla);
out[i] = make_aligned16SpSpinor(foo, foo, foo, foo);
}
}
typedef struct {
aligned8SpSu3vec e0;
aligned8SpSu3vec e1;
aligned8SpSu3vec e2;
aligned8SpSu3vec e3;
} __attribute__((aligned(32))) aligned32SpSpinor;
aligned32SpSpinor make_aligned32SpSpinor(const aligned8SpSu3vec e0, const aligned8SpSu3vec e1, const aligned8SpSu3vec e2, const aligned8SpSu3vec e3) {
return (aligned32SpSpinor) {e0, e1, e2, e3};
}
aligned32SpSpinor aligned32SpSpinorAdd(const aligned32SpSpinor left, const aligned32SpSpinor right) {
return make_aligned32SpSpinor(
aligned8SpSu3vecAdd(left.e0, right.e0),
aligned8SpSu3vecAdd(left.e1, right.e1),
aligned8SpSu3vecAdd(left.e2, right.e2),
aligned8SpSu3vecAdd(left.e3, right.e3)
);
}
__kernel void copyAligned32SpSpinorRestricted(__global aligned32SpSpinor * const restrict out, __global const aligned32SpSpinor * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readAligned32SpSpinorRestricted(__global aligned32SpSpinor * const restrict out, __global const aligned32SpSpinor * const restrict in, const ulong elems)
{
spComplex bla = make_spComplex(0.0f, 0.0f);
aligned8SpSu3vec foo = make_aligned8SpSu3vec(bla, bla, bla);
aligned32SpSpinor tmp = make_aligned32SpSpinor(foo, foo, foo, foo);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = aligned32SpSpinorAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeAligned32SpSpinorRestricted(__global aligned32SpSpinor * const restrict out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
spComplex bla = make_spComplex(in, in);
aligned8SpSu3vec foo = make_aligned8SpSu3vec(bla, bla, bla);
out[i] = make_aligned32SpSpinor(foo, foo, foo, foo);
}
}
/*
* double kernels
*/
#ifdef DOUBLE_ENABLED
__kernel void copyDouble(__global double * out, __global double * in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readDouble(__global double * out, __global double * in, const ulong elems)
{
double tmp = 0.0f;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp += in[i];
}
out[get_global_id(0)] = tmp;
}
__kernel void writeDouble(__global double * out, const double in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in;
}
}
__kernel void copyDoubleRestricted(__global double * const restrict out, __global const double * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readDoubleRestricted(__global double * const restrict out, __global const double * const restrict in, const ulong elems)
{
double tmp = 0.0f;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp += in[i];
}
out[get_global_id(0)] = tmp;
}
__kernel void writeDoubleRestricted(__global double * const restrict out, const double in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in;
}
}
__kernel void copyDouble2(__global double2 * out, __global double2 * in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readDouble2(__global double2 * out, __global double2 * in, const ulong elems)
{
double2 tmp = 0.0f;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp += in[i];
}
out[get_global_id(0)] = tmp;
}
__kernel void writeDouble2(__global double2 * out, const double in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in;
}
}
__kernel void copyDouble4(__global double4 * out, __global double4 * in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readDouble4(__global double4 * out, __global double4 * in, const ulong elems)
{
double4 tmp = 0.0f;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp += in[i];
}
out[get_global_id(0)] = tmp;
}
__kernel void writeDouble4(__global double4 * out, const double in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in;
}
}
/*
* Double precision complex
*/
typedef struct { double re; double im; } dpComplex;
dpComplex make_dpComplex(const double re, const double im) {
return (dpComplex) {re, im};
}
dpComplex dpComplexAdd(const dpComplex left, const dpComplex right) {
return make_dpComplex(left.re + right.re, left.im + right.im);
}
__kernel void copyDpComplex(__global dpComplex * out, __global dpComplex * in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readDpComplex(__global dpComplex * out, __global dpComplex * in, const ulong elems)
{
dpComplex tmp = make_dpComplex(0.0f, 0.0f);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = dpComplexAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeDpComplex(__global dpComplex * out, const double in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = make_dpComplex(in, in);
}
}
__kernel void copyDpComplexRestricted(__global dpComplex * const restrict out, __global const dpComplex * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readDpComplexRestricted(__global dpComplex * const restrict out, __global const dpComplex * const restrict in, const ulong elems)
{
dpComplex tmp = make_dpComplex(0.0f, 0.0f);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = dpComplexAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeDpComplexRestricted(__global dpComplex * const restrict out, const double in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = make_dpComplex(in, in);
}
}
typedef struct { double re; double im; } __attribute__((aligned (16))) alignedDpComplex;
alignedDpComplex make_alignedDpComplex(const double re, const double im) {
return (alignedDpComplex) {re, im};
}
alignedDpComplex alignedDpComplexAdd(const alignedDpComplex left, const alignedDpComplex right) {
return make_alignedDpComplex(left.re + right.re, left.im + right.im);
}
__kernel void copyAlignedDpComplex(__global alignedDpComplex * out, __global alignedDpComplex * in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readAlignedDpComplex(__global alignedDpComplex * out, __global alignedDpComplex * in, const ulong elems)
{
alignedDpComplex tmp = make_alignedDpComplex(0.0f, 0.0f);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = alignedDpComplexAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeAlignedDpComplex(__global alignedDpComplex * out, const double in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = make_alignedDpComplex(in, in);
}
}
__kernel void copyAlignedDpComplexRestricted(__global alignedDpComplex * const restrict out, __global const alignedDpComplex * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readAlignedDpComplexRestricted(__global alignedDpComplex * const restrict out, __global const alignedDpComplex * const restrict in, const ulong elems)
{
alignedDpComplex tmp = make_alignedDpComplex(0.0f, 0.0f);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = alignedDpComplexAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeAlignedDpComplexRestricted(__global alignedDpComplex * const restrict out, const double in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = make_alignedDpComplex(in, in);
}
}
/*
* Double precisoin SU3 vectors
* We always base on aligned types, even if we don't explicitly align the struct
*/
typedef struct {
alignedDpComplex e0;
alignedDpComplex e1;
alignedDpComplex e2;
} dpSu3vec;
dpSu3vec make_dpSu3vec(const alignedDpComplex e0, const alignedDpComplex e1, const alignedDpComplex e2) {
return (dpSu3vec) {e0, e1, e2};
}
dpSu3vec dpSu3vecAdd(const dpSu3vec left, const dpSu3vec right) {
return make_dpSu3vec(
alignedDpComplexAdd(left.e0, right.e0),
alignedDpComplexAdd(left.e1, right.e1),
alignedDpComplexAdd(left.e2, right.e2)
);
}
__kernel void copyDpSu3vec(__global dpSu3vec * out, __global dpSu3vec * in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readDpSu3vec(__global dpSu3vec * out, __global dpSu3vec * in, const ulong elems)
{
alignedDpComplex bla = make_alignedDpComplex(0.0f, 0.0f);
dpSu3vec tmp = make_dpSu3vec(bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = dpSu3vecAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeDpSu3vec(__global dpSu3vec * out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
alignedDpComplex bla = make_alignedDpComplex(in, in);
out[i] = make_dpSu3vec(bla, bla, bla);
}
}
__kernel void copyDpSu3vecRestricted(__global dpSu3vec * const restrict out, __global const dpSu3vec * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readDpSu3vecRestricted(__global dpSu3vec * const restrict out, __global const dpSu3vec * const restrict in, const ulong elems)
{
alignedDpComplex bla = make_alignedDpComplex(0.0f, 0.0f);
dpSu3vec tmp = make_dpSu3vec(bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = dpSu3vecAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeDpSu3vecRestricted(__global dpSu3vec * const restrict out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
alignedDpComplex bla = make_alignedDpComplex(in, in);
out[i] = make_dpSu3vec(bla, bla, bla);
}
}
typedef struct {
alignedDpComplex e0;
alignedDpComplex e1;
alignedDpComplex e2;
} __attribute((aligned(16))) aligned16DpSu3vec;
aligned16DpSu3vec make_aligned16DpSu3vec(const alignedDpComplex e0, const alignedDpComplex e1, const alignedDpComplex e2) {
return (aligned16DpSu3vec) {e0, e1, e2};
}
aligned16DpSu3vec aligned16DpSu3vecAdd(const aligned16DpSu3vec left, const aligned16DpSu3vec right) {
return make_aligned16DpSu3vec(
alignedDpComplexAdd(left.e0, right.e0),
alignedDpComplexAdd(left.e1, right.e1),
alignedDpComplexAdd(left.e2, right.e2)
);
}
__kernel void copyAligned16DpSu3vecRestricted(__global aligned16DpSu3vec * const restrict out, __global const aligned16DpSu3vec * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readAligned16DpSu3vecRestricted(__global aligned16DpSu3vec * const restrict out, __global const aligned16DpSu3vec * const restrict in, const ulong elems)
{
alignedDpComplex bla = make_alignedDpComplex(0.0f, 0.0f);
aligned16DpSu3vec tmp = make_aligned16DpSu3vec(bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = aligned16DpSu3vecAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeAligned16DpSu3vecRestricted(__global aligned16DpSu3vec * const restrict out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
alignedDpComplex bla = make_alignedDpComplex(in, in);
out[i] = make_aligned16DpSu3vec(bla, bla, bla);
}
}
typedef struct {
alignedDpComplex e0;
alignedDpComplex e1;
alignedDpComplex e2;
} __attribute((aligned(32))) aligned32DpSu3vec;
aligned32DpSu3vec make_aligned32DpSu3vec(const alignedDpComplex e0, const alignedDpComplex e1, const alignedDpComplex e2) {
return (aligned32DpSu3vec) {e0, e1, e2};
}
aligned32DpSu3vec aligned32DpSu3vecAdd(const aligned32DpSu3vec left, const aligned32DpSu3vec right) {
return make_aligned32DpSu3vec(
alignedDpComplexAdd(left.e0, right.e0),
alignedDpComplexAdd(left.e1, right.e1),
alignedDpComplexAdd(left.e2, right.e2)
);
}
__kernel void copyAligned32DpSu3vecRestricted(__global aligned32DpSu3vec * const restrict out, __global const aligned32DpSu3vec * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readAligned32DpSu3vecRestricted(__global aligned32DpSu3vec * const restrict out, __global const aligned32DpSu3vec * const restrict in, const ulong elems)
{
alignedDpComplex bla = make_alignedDpComplex(0.0f, 0.0f);
aligned32DpSu3vec tmp = make_aligned32DpSu3vec(bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = aligned32DpSu3vecAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeAligned32DpSu3vecRestricted(__global aligned32DpSu3vec * const restrict out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
alignedDpComplex bla = make_alignedDpComplex(in, in);
out[i] = make_aligned32DpSu3vec(bla, bla, bla);
}
}
dpSu3vec getDpSu3vecSOA(__global const alignedDpComplex * const restrict in, const size_t i, const size_t stride)
{
return make_dpSu3vec(in[0 * stride + i], in[ 1 * stride + i], in[ 2 * stride + i]);
}
void putDpSu3vecSOA(__global alignedDpComplex * const restrict out, const size_t i, const dpSu3vec val, const size_t stride)
{
out[ 0 * stride + i] = val.e0;
out[ 1 * stride + i] = val.e1;
out[ 2 * stride + i] = val.e2;
}
__kernel void copyDpSu3vecSOARestricted(__global alignedDpComplex * const restrict out, __global const alignedDpComplex * const restrict in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
dpSu3vec tmp = getDpSu3vecSOA(in, i, stride);
putDpSu3vecSOA(out, i, tmp, stride);
}
}
__kernel void readDpSu3vecSOARestricted(__global alignedDpComplex * const restrict out, __global const alignedDpComplex * const restrict in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
alignedDpComplex bla = make_alignedDpComplex(0.0f, 0.0f);
dpSu3vec tmp = make_dpSu3vec(bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = dpSu3vecAdd(tmp, getDpSu3vecSOA(in, i, stride));
}
putDpSu3vecSOA(out, get_global_id(0), tmp, stride);
}
__kernel void writeDpSu3vecSOARestricted(__global alignedDpComplex * const restrict out, const float in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
alignedDpComplex bla = make_alignedDpComplex(in, in);
dpSu3vec tmp = make_dpSu3vec(bla, bla, bla);
putDpSu3vecSOA(out, i, tmp, stride);
}
}
dpSu3vec getDpSu3vecFullSOA(__global const double * const restrict in, const size_t i, const size_t stride)
{
return make_dpSu3vec(make_alignedDpComplex(in[0 * stride + i], in[1 * stride + i]),
make_alignedDpComplex(in[2 * stride + i], in[3 * stride + i]),
make_alignedDpComplex(in[4 * stride + i], in[5 * stride + i]));
}
void putDpSu3vecFullSOA(__global double * const restrict out, const size_t i, const dpSu3vec val, const size_t stride)
{
out[ 0 * stride + i] = val.e0.re;
out[ 1 * stride + i] = val.e0.im;
out[ 2 * stride + i] = val.e1.re;
out[ 3 * stride + i] = val.e1.im;
out[ 4 * stride + i] = val.e2.re;
out[ 5 * stride + i] = val.e2.im;
}
__kernel void copyDpSu3vecFullSOARestricted(__global double * const restrict out, __global const double * const restrict in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
dpSu3vec tmp = getDpSu3vecFullSOA(in, i, stride);
putDpSu3vecFullSOA(out, i, tmp, stride);
}
}
__kernel void readDpSu3vecFullSOARestricted(__global double * const restrict out, __global const double * const restrict in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
alignedDpComplex bla = make_alignedDpComplex(0.0f, 0.0f);
dpSu3vec tmp = make_dpSu3vec(bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = dpSu3vecAdd(tmp, getDpSu3vecFullSOA(in, i, stride));
}
putDpSu3vecFullSOA(out, get_global_id(0), tmp, stride);
}
__kernel void writeDpSu3vecFullSOARestricted(__global double * const restrict out, const float in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
alignedDpComplex bla = make_alignedDpComplex(in, in);
dpSu3vec tmp = make_dpSu3vec(bla, bla, bla);
putDpSu3vecFullSOA(out, i, tmp, stride);
}
}
/*
* Double precisoin SU3 matrices
* We always base on aligned types, even if we don't explicitly align the struct
*/
typedef struct {
alignedDpComplex e00, e01, e02;
alignedDpComplex e10, e11, e12;
alignedDpComplex e20, e21, e22;
} dpSu3;
dpSu3 make_dpSu3(const alignedDpComplex e00, const alignedDpComplex e01, const alignedDpComplex e02,
const alignedDpComplex e10, const alignedDpComplex e11, const alignedDpComplex e12,
const alignedDpComplex e20, const alignedDpComplex e21, const alignedDpComplex e22) {
return (dpSu3) {e00, e01, e02,
e10, e11, e12,
e20, e21, e22};
}
dpSu3 dpSu3Add(const dpSu3 left, const dpSu3 right) {
return make_dpSu3(
alignedDpComplexAdd(left.e00, right.e00),
alignedDpComplexAdd(left.e01, right.e01),
alignedDpComplexAdd(left.e02, right.e02),
alignedDpComplexAdd(left.e10, right.e10),
alignedDpComplexAdd(left.e11, right.e11),
alignedDpComplexAdd(left.e12, right.e12),
alignedDpComplexAdd(left.e20, right.e20),
alignedDpComplexAdd(left.e21, right.e21),
alignedDpComplexAdd(left.e22, right.e22)
);
}
__kernel void copyDpSu3(__global dpSu3 * out, __global dpSu3 * in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readDpSu3(__global dpSu3 * out, __global dpSu3 * in, const ulong elems)
{
alignedDpComplex bla = make_alignedDpComplex(0.0f, 0.0f);
dpSu3 tmp = make_dpSu3(bla, bla, bla, bla, bla, bla, bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = dpSu3Add(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeDpSu3(__global dpSu3 * out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
alignedDpComplex bla = make_alignedDpComplex(in, in);
out[i] = make_dpSu3(bla, bla, bla, bla, bla, bla, bla, bla, bla);
}
}
__kernel void copyDpSu3Restricted(__global dpSu3 * const restrict out, __global const dpSu3 * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readDpSu3Restricted(__global dpSu3 * const restrict out, __global const dpSu3 * const restrict in, const ulong elems)
{
alignedDpComplex bla = make_alignedDpComplex(0.0f, 0.0f);
dpSu3 tmp = make_dpSu3(bla, bla, bla, bla, bla, bla, bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = dpSu3Add(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeDpSu3Restricted(__global dpSu3 * const restrict out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
alignedDpComplex bla = make_alignedDpComplex(in, in);
out[i] = make_dpSu3(bla, bla, bla, bla, bla, bla, bla, bla, bla);
}
}
typedef struct {
alignedDpComplex e00, e01, e02;
alignedDpComplex e10, e11, e12;
alignedDpComplex e20, e21, e22;
} __attribute((aligned(16))) aligned16DpSu3;
aligned16DpSu3 make_aligned16DpSu3(const alignedDpComplex e00, const alignedDpComplex e01, const alignedDpComplex e02,
const alignedDpComplex e10, const alignedDpComplex e11, const alignedDpComplex e12,
const alignedDpComplex e20, const alignedDpComplex e21, const alignedDpComplex e22) {
return (aligned16DpSu3) {e00, e01, e02,
e10, e11, e12,
e20, e21, e22};
}
aligned16DpSu3 aligned16DpSu3Add(const aligned16DpSu3 left, const aligned16DpSu3 right) {
return make_aligned16DpSu3(
alignedDpComplexAdd(left.e00, right.e00),
alignedDpComplexAdd(left.e01, right.e01),
alignedDpComplexAdd(left.e02, right.e02),
alignedDpComplexAdd(left.e10, right.e10),
alignedDpComplexAdd(left.e11, right.e11),
alignedDpComplexAdd(left.e12, right.e12),
alignedDpComplexAdd(left.e20, right.e20),
alignedDpComplexAdd(left.e21, right.e21),
alignedDpComplexAdd(left.e22, right.e22)
);
}
__kernel void copyAligned16DpSu3Restricted(__global aligned16DpSu3 * const restrict out, __global const aligned16DpSu3 * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readAligned16DpSu3Restricted(__global aligned16DpSu3 * const restrict out, __global const aligned16DpSu3 * const restrict in, const ulong elems)
{
alignedDpComplex bla = make_alignedDpComplex(0.0f, 0.0f);
aligned16DpSu3 tmp = make_aligned16DpSu3(bla, bla, bla, bla, bla, bla, bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = aligned16DpSu3Add(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeAligned16DpSu3Restricted(__global aligned16DpSu3 * const restrict out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
alignedDpComplex bla = make_alignedDpComplex(in, in);
out[i] = make_aligned16DpSu3(bla, bla, bla, bla, bla, bla, bla, bla, bla);
}
}
typedef struct {
alignedDpComplex e00, e01, e02;
alignedDpComplex e10, e11, e12;
alignedDpComplex e20, e21, e22;
} __attribute((aligned(32))) aligned32DpSu3;
aligned32DpSu3 make_aligned32DpSu3(const alignedDpComplex e00, const alignedDpComplex e01, const alignedDpComplex e02,
const alignedDpComplex e10, const alignedDpComplex e11, const alignedDpComplex e12,
const alignedDpComplex e20, const alignedDpComplex e21, const alignedDpComplex e22) {
return (aligned32DpSu3) {e00, e01, e02,
e10, e11, e12,
e20, e21, e22};
}
aligned32DpSu3 aligned32DpSu3Add(const aligned32DpSu3 left, const aligned32DpSu3 right) {
return make_aligned32DpSu3(
alignedDpComplexAdd(left.e00, right.e00),
alignedDpComplexAdd(left.e01, right.e01),
alignedDpComplexAdd(left.e02, right.e02),
alignedDpComplexAdd(left.e10, right.e10),
alignedDpComplexAdd(left.e11, right.e11),
alignedDpComplexAdd(left.e12, right.e12),
alignedDpComplexAdd(left.e20, right.e20),
alignedDpComplexAdd(left.e21, right.e21),
alignedDpComplexAdd(left.e22, right.e22)
);
}
__kernel void copyAligned32DpSu3Restricted(__global aligned32DpSu3 * const restrict out, __global const aligned32DpSu3 * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readAligned32DpSu3Restricted(__global aligned32DpSu3 * const restrict out, __global const aligned32DpSu3 * const restrict in, const ulong elems)
{
alignedDpComplex bla = make_alignedDpComplex(0.0f, 0.0f);
aligned32DpSu3 tmp = make_aligned32DpSu3(bla, bla, bla, bla, bla, bla, bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = aligned32DpSu3Add(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeAligned32DpSu3Restricted(__global aligned32DpSu3 * const restrict out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
alignedDpComplex bla = make_alignedDpComplex(in, in);
out[i] = make_aligned32DpSu3(bla, bla, bla, bla, bla, bla, bla, bla, bla);
}
}
dpSu3 getDpSu3SOA(__global const alignedDpComplex * const restrict in, const size_t i, const size_t stride)
{
return make_dpSu3(in[0 * stride + i], in[1 * stride + i], in[2 * stride + i],
in[3 * stride + i], in[4 * stride + i], in[5 * stride + i],
in[6 * stride + i], in[7 * stride + i], in[8 * stride + i]);
}
void putDpSu3SOA(__global alignedDpComplex * const restrict out, const size_t i, const dpSu3 val, const size_t stride)
{
out[0 * stride + i] = val.e00;
out[1 * stride + i] = val.e01;
out[2 * stride + i] = val.e02;
out[3 * stride + i] = val.e10;
out[4 * stride + i] = val.e11;
out[5 * stride + i] = val.e12;
out[6 * stride + i] = val.e20;
out[7 * stride + i] = val.e21;
out[8 * stride + i] = val.e22;
}
__kernel void copyDpSu3SOARestricted(__global alignedDpComplex * const restrict out, __global const alignedDpComplex * const restrict in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
dpSu3 tmp = getDpSu3SOA(in, i, stride);
putDpSu3SOA(out, i, tmp, stride);
}
}
__kernel void readDpSu3SOARestricted(__global alignedDpComplex * const restrict out, __global const alignedDpComplex * const restrict in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
alignedDpComplex bla = make_alignedDpComplex(0.0f, 0.0f);
dpSu3 tmp = make_dpSu3(bla, bla, bla, bla, bla, bla, bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = dpSu3Add(tmp, getDpSu3SOA(in, i, stride));
}
putDpSu3SOA(out, get_global_id(0), tmp, stride);
}
__kernel void writeDpSu3SOARestricted(__global alignedDpComplex * const restrict out, const float in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
alignedDpComplex bla = make_alignedDpComplex(in, in);
dpSu3 tmp = make_dpSu3(bla, bla, bla, bla, bla, bla, bla, bla, bla);
putDpSu3SOA(out, i, tmp, stride);
}
}
dpSu3 getDpSu3FullSOA(__global const double * const restrict in, const size_t i, const size_t stride)
{
return make_dpSu3(make_alignedDpComplex(in[ 0 * stride + i], in[ 1 * stride + i]),
make_alignedDpComplex(in[ 2 * stride + i], in[ 3 * stride + i]),
make_alignedDpComplex(in[ 4 * stride + i], in[ 5 * stride + i]),
make_alignedDpComplex(in[ 6 * stride + i], in[ 7 * stride + i]),
make_alignedDpComplex(in[ 8 * stride + i], in[ 9 * stride + i]),
make_alignedDpComplex(in[10 * stride + i], in[11 * stride + i]),
make_alignedDpComplex(in[12 * stride + i], in[13 * stride + i]),
make_alignedDpComplex(in[14 * stride + i], in[15 * stride + i]),
make_alignedDpComplex(in[16 * stride + i], in[17 * stride + i]));
}
void putDpSu3FullSOA(__global double * const restrict out, const size_t i, const dpSu3 val, const size_t stride)
{
out[ 0 * stride + i] = val.e00.re;
out[ 1 * stride + i] = val.e00.im;
out[ 2 * stride + i] = val.e01.re;
out[ 3 * stride + i] = val.e01.im;
out[ 4 * stride + i] = val.e02.re;
out[ 5 * stride + i] = val.e02.im;
out[ 6 * stride + i] = val.e10.re;
out[ 7 * stride + i] = val.e10.im;
out[ 8 * stride + i] = val.e11.re;
out[ 9 * stride + i] = val.e11.im;
out[10 * stride + i] = val.e12.re;
out[11 * stride + i] = val.e12.im;
out[12 * stride + i] = val.e20.re;
out[13 * stride + i] = val.e20.im;
out[14 * stride + i] = val.e21.re;
out[15 * stride + i] = val.e21.im;
out[16 * stride + i] = val.e22.re;
out[17 * stride + i] = val.e22.im;
}
__kernel void copyDpSu3FullSOARestricted(__global double * const restrict out, __global const double * const restrict in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
dpSu3 tmp = getDpSu3FullSOA(in, i, elems);
putDpSu3FullSOA(out, i, tmp, stride);
}
}
__kernel void readDpSu3FullSOARestricted(__global double * const restrict out, __global const double * const restrict in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
alignedDpComplex bla = make_alignedDpComplex(0.0f, 0.0f);
dpSu3 tmp = make_dpSu3(bla, bla, bla, bla, bla, bla, bla, bla, bla);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = dpSu3Add(tmp, getDpSu3FullSOA(in, i, stride));
}
putDpSu3FullSOA(out, get_global_id(0), tmp, stride);
}
__kernel void writeDpSu3FullSOARestricted(__global double * const restrict out, const float in, const ulong elems, ulong stride)
{
stride = (stride == 0) ? elems : stride;
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
alignedDpComplex bla = make_alignedDpComplex(in, in);
dpSu3 tmp = make_dpSu3(bla, bla, bla, bla, bla, bla, bla, bla, bla);
putDpSu3FullSOA(out, i, tmp, stride);
}
}
/*
* Double precisoin Spinors
* We always base on aligned types, even if we don't explicitly align the struct
*/
typedef struct {
aligned16DpSu3vec e0;
aligned16DpSu3vec e1;
aligned16DpSu3vec e2;
aligned16DpSu3vec e3;
} dpSpinor;
dpSpinor make_dpSpinor(const aligned16DpSu3vec e0, const aligned16DpSu3vec e1, const aligned16DpSu3vec e2, const aligned16DpSu3vec e3)
{
return (dpSpinor) {e0, e1, e2, e3};
}
dpSpinor dpSpinorAdd(const dpSpinor left, const dpSpinor right) {
return make_dpSpinor(
aligned16DpSu3vecAdd(left.e0, right.e0),
aligned16DpSu3vecAdd(left.e1, right.e1),
aligned16DpSu3vecAdd(left.e2, right.e2),
aligned16DpSu3vecAdd(left.e3, right.e3)
);
}
__kernel void copyDpSpinor(__global dpSpinor * out, __global dpSpinor * in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readDpSpinor(__global dpSpinor * out, __global dpSpinor * in, const ulong elems)
{
alignedDpComplex bla = make_alignedDpComplex(0.0f, 0.0f);
aligned16DpSu3vec foo = make_aligned16DpSu3vec(bla, bla, bla);
dpSpinor tmp = make_dpSpinor(foo, foo, foo, foo);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = dpSpinorAdd(tmp, in[i]);
}
out[get_global_id(0)] = tmp;
}
__kernel void writeDpSpinor(__global dpSpinor * out, const float in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
alignedDpComplex bla = make_alignedDpComplex(in, in);
aligned16DpSu3vec foo = make_aligned16DpSu3vec(bla, bla, bla);
out[i] = make_dpSpinor(foo, foo, foo, foo);
}
}
__kernel void copyDpSpinorRestricted(__global dpSpinor * const restrict out, __global const dpSpinor * const restrict in, const ulong elems)
{
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
out[i] = in[i];
}
}
__kernel void readDpSpinorRestricted(__global dpSpinor * const restrict out, __global const dpSpinor * const restrict in, const ulong elems)
{
alignedDpComplex bla = make_alignedDpComplex(0.0f, 0.0f);
aligned16DpSu3vec foo = make_aligned16DpSu3vec(bla, bla, bla);
dpSpinor tmp = make_dpSpinor(foo, foo, foo, foo);
for(size_t i = get_global_id(0); i < elems; i += get_global_size(0)) {
tmp = dpSpinorAdd(tmp, in[i]);