#!/usr/bin/python
import os, subprocess, ctypes
from mako.template import Template
full_corr = """
__device__ cufftComplex in_call(void* input, size_t offset,
void* caller_info, void* shared) {
cufftComplex r;
cufftComplex* hp = ((cufftComplex*) callback_params.htilde);
cufftComplex h = hp[offset];
cufftComplex s = ((cufftComplex*) input)[offset];
r.x = h.x * s.x + h.y * s.y;
r.y = h.x * s.y - h.y * s.x;
return r;
}
"""
zero_corr = """
__device__ cufftComplex in_call(void* input, size_t offset,
void* caller_info, void* shared) {
if (offset >= callback_params.in_kmax)
return (cufftComplex){0, 0};
else{
cufftComplex r;
cufftComplex s = ((cufftComplex*) input)[offset];
cufftComplex* hp = ((cufftComplex*) callback_params.htilde);
cufftComplex h = hp[offset];
r.x = h.x * s.x + h.y * s.y;
r.y = h.x * s.y - h.y * s.x;
return r;
}
}
"""
zero_out = """
__device__ void out_call(void *out, size_t offset, cufftComplex element,
void *caller_info, void *shared){
if (offset > callback_params.out_kmin && offset < callback_params.out_kmax)
((cufftComplex*) out)[offset] = element;
}
"""
copy_callback = """
__device__ cufftComplex correlate(void* input, size_t offset,
void* caller_info, void* shared) {
return ((cufftComplex*)input)[offset];
}
"""
copy_out = """
__device__ void out_call(void *out, size_t offset, cufftComplex element,
void *caller_info, void *shared){
((cufftComplex*) out)[offset] = element;
}
"""
real_out = """
__device__ void out_call(void *out, size_t offset, cufftComplex element,
void *caller_info, void *shared){
((cufftReal*) out)[offset] = element.x * element.x + element.y * element.y;
}
"""
copy_out_fp16 = """
__device__ void out_call(void *out, size_t offset, cufftComplex element,
void *caller_info, void *shared){
((short*) out)[offset*2] = __float2half_rn(element.x);
((short*) out)[offset*2+1] = __float2half_rn(element.y);
}
"""
no_out = """
__device__ void out_call(void *out, size_t offset, cufftComplex element,
void *caller_info, void *shared){
}
"""
fftsrc = Template("""
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
typedef struct {
%for t, n in parameters:
${t} ${n};
%endfor
} param_t;
__constant__ param_t callback_params;
#define checkCudaErrors(val) __checkCudaErrors__ ( (val), #val, __FILE__, __LINE__ )
template <typename T>
inline void __checkCudaErrors__(T code, const char *func, const char *file, int line)
{
if (code) {
fprintf(stderr, "CUDA error at %s:%d code=%d \\"%s\\" \\n",
file, line, (unsigned int)code, func);
cudaDeviceReset();
exit(EXIT_FAILURE);
}
}
% if input_callback:
${input_callback}
__device__ cufftCallbackLoadC input_callback = in_call;
% endif
% if output_callback:
${output_callback}
__device__ cufftCallbackStoreC output_callback = out_call;
% endif
extern "C" cufftHandle* create_plan(unsigned int size){
cufftHandle* plan = new cufftHandle;
size_t work_size;
cufftCreate(plan);
checkCudaErrors(cufftMakePlan1d(*plan, size, CUFFT_C2C, 1, &work_size));
% if input_callback:
cufftCallbackLoadC h_input_callback;
checkCudaErrors(cudaMemcpyFromSymbol(&h_input_callback, input_callback,
sizeof(h_input_callback)));
checkCudaErrors(cufftXtSetCallback(*plan, (void **) &h_input_callback,
CUFFT_CB_LD_COMPLEX, 0));
% endif
% if output_callback:
cufftCallbackStoreC h_output_callback;
checkCudaErrors(cudaMemcpyFromSymbol(&h_output_callback, output_callback,
sizeof(h_output_callback)));
checkCudaErrors(cufftXtSetCallback(*plan, (void **) &h_output_callback,
CUFFT_CB_ST_COMPLEX, 0));
% endif
return plan;
}
extern "C" void execute(cufftHandle* plan, cufftComplex* in, cufftComplex* out, param_t* p){
if (p != NULL)
checkCudaErrors(cudaMemcpyToSymbolAsync(callback_params, p, sizeof(param_t), 0, cudaMemcpyHostToDevice, 0));
checkCudaErrors(cufftExecC2C(*plan, in, out, CUFFT_INVERSE));
}
""")
[docs]
def compile(source, name):
""" Compile the string source code into a shared object linked against
the static version of cufft for callback support.
"""
# If we start using this again, we should find a better place for the cache
cache = os.path.join('/tmp', name)
hash_file = cache + ".hash"
lib_file = cache + ".so"
obj_file = cache + ".o"
try:
if int(open(hash_file, "r").read()) == hash(source):
return lib_file
raise ValueError
except:
pass
src_file = cache + ".cu"
fsrc = open(src_file, "w")
fsrc.write(source)
fsrc.close()
cmd = ["nvcc", "-ccbin", "g++", "-dc", "-m64",
"--compiler-options", "'-fPIC'",
"-o", obj_file,
"-c", src_file]
print(" ".join(cmd))
subprocess.check_call(cmd)
cmd = ["nvcc", "-shared", "-ccbin", "g++", "-m64",
"-o", lib_file, obj_file, "-lcufft_static", "-lculibos"]
print(" ".join(cmd))
subprocess.check_call(cmd)
hash_file = cache + ".hash"
fhash = open(hash_file, "w")
fhash.write(str(hash(source)))
return lib_file
[docs]
def get_fn_plan(callback=None, out_callback=None, name='pycbc_cufft', parameters=None):
""" Get the IFFT execute and plan functions
"""
if parameters is None:
parameters = []
source = fftsrc.render(input_callback=callback, output_callback=out_callback, parameters=parameters)
path = compile(source, name)
lib = ctypes.cdll.LoadLibrary(path)
fn = lib.execute
fn.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]
plan = lib.create_plan
plan.restype = ctypes.c_void_p
plan.argyptes = [ctypes.c_uint]
return fn, plan
_plans = {}
[docs]
class param(ctypes.Structure):
_fields_ = [("htilde", ctypes.c_void_p)]
hparam = param()
[docs]
def c2c_correlate_ifft(htilde, stilde, outvec):
key = 'cnf'
if key not in _plans:
fn, pfn = get_fn_plan(callback=full_corr, parameters = [("void*", "htilde")])
plan = pfn(len(outvec), int(htilde.data.gpudata))
_plans[key] = (fn, plan, int(htilde.data.gpudata))
fn, plan, _ = _plans[key]
hparam.htilde = htilde.data.gpudata
fn(plan, int(stilde.data.gpudata), int(outvec.data.gpudata), ctypes.pointer(hparam))
[docs]
class param2(ctypes.Structure):
_fields_ = [("htilde", ctypes.c_void_p),
("in_kmax", ctypes.c_uint),
("out_kmin", ctypes.c_uint),
("out_kmax", ctypes.c_uint)]
hparam_zeros = param2()
[docs]
def c2c_half_correlate_ifft(htilde, stilde, outvec):
key = 'cn'
if key not in _plans:
fn, pfn = get_fn_plan(callback=zero_corr,
parameters = [("void*", "htilde"),
("unsigned int", "in_kmax"),
("unsigned int", "out_kmin"),
("unsigned int", "out_kmax")],
out_callback=zero_out)
plan = pfn(len(outvec), int(htilde.data.gpudata))
_plans[key] = (fn, plan, int(htilde.data.gpudata))
fn, plan, _ = _plans[key]
hparam_zeros.htilde = htilde.data.gpudata
hparam_zeros.in_kmax = htilde.end_idx
hparam_zeros.out_kmin = stilde.analyze.start
hparam_zeros.out_kmax = stilde.analyze.stop
fn(plan, int(stilde.data.gpudata), int(outvec.data.gpudata), ctypes.pointer(hparam_zeros))