Source code for pycbc.events.threshold_cupy

# Copyright (C) 2012  Alex Nitz
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.


#
# =============================================================================
#
#                                   Preamble
#
# =============================================================================
#

import cupy as cp
import functools
import mako.template
from .eventmgr import _BaseThresholdCluster

val = None
loc = None

# https://stackoverflow.com/questions/77798014/cupy-rawkernel-cuda-error-not-found-named-symbol-not-found-cupy

tkernel1 = mako.template.Template("""
extern "C" __global__ void threshold_and_cluster(float2* in, float2* outv, int* outl, int window, float threshold){
    int s = window * blockIdx.x;
    int e = s + window;

    // shared memory for chuck size candidates
    __shared__ float svr[${chunk}];
    __shared__ float svi[${chunk}];
    __shared__ int sl[${chunk}];

    // shared memory for the warp size candidates
    __shared__ float svv[32];
    __shared__ int idx[32];

    int ml = -1;
    float mvr = 0;
    float mvi = 0;
    float re;
    float im;

    // Iterate trought the entire window size chunk and find blockDim.x number
    // of candidates
    for (int i = s + threadIdx.x; i < e; i += blockDim.x){
        re = in[i].x;
        im = in[i].y;
        if ((re * re + im * im) > (mvr * mvr + mvi * mvi)){
            mvr = re;
            mvi = im;
            ml = i;
        }
    }

    // Save the candidate from this thread to shared memory
    svr[threadIdx.x] = mvr;
    svi[threadIdx.x] = mvi;
    sl[threadIdx.x] = ml;

    __syncthreads();

    if (threadIdx.x < 32){
        int tl = threadIdx.x;

        // Now that we have all the candiates for this chunk in shared memory
        // Iterate through in the warp size to reduce to 32 candidates
        for (int i = threadIdx.x; i < ${chunk}; i += 32){
            re = svr[i];
            im = svi[i];
            if ((re * re + im * im) > (mvr * mvr + mvi * mvi)){
                tl = i;
                mvr = re;
                mvi = im;
            }
        }

        // Store the 32 candidates into shared memory
        svv[threadIdx.x] = svr[tl] * svr[tl] + svi[tl] * svi[tl];
        idx[threadIdx.x] = tl;

        // Find the 1 candidate we are looking for using a manual log algorithm
        if ((threadIdx.x < 16) && (svv[threadIdx.x] < svv[threadIdx.x + 16])){
            svv[threadIdx.x] = svv[threadIdx.x + 16];
            idx[threadIdx.x] = idx[threadIdx.x + 16];
        }

        if ((threadIdx.x < 8) && (svv[threadIdx.x] < svv[threadIdx.x + 8])){
            svv[threadIdx.x] = svv[threadIdx.x + 8];
            idx[threadIdx.x] = idx[threadIdx.x + 8];
        }

        if ((threadIdx.x < 4) && (svv[threadIdx.x] < svv[threadIdx.x + 4])){
            svv[threadIdx.x] = svv[threadIdx.x + 4];
            idx[threadIdx.x] = idx[threadIdx.x + 4];
        }

        if ((threadIdx.x < 2) && (svv[threadIdx.x] < svv[threadIdx.x + 2])){
            svv[threadIdx.x] = svv[threadIdx.x + 2];
            idx[threadIdx.x] = idx[threadIdx.x + 2];
        }


        // Save the 1 candidate maximum and location to the output vectors
        if (threadIdx.x == 0){
            if (svv[threadIdx.x] < svv[threadIdx.x + 1]){
                idx[0] = idx[1];
                svv[0] = svv[1];
            }

            if (svv[0] > threshold){
                tl = idx[0];
                outv[blockIdx.x].x = svr[tl];
                outv[blockIdx.x].y = svi[tl];
                outl[blockIdx.x] = sl[tl];
            } else{
                outl[blockIdx.x] = -1;
            }
        }
    }
}
""")

tkernel2 = mako.template.Template("""
extern "C" __global__ void threshold_and_cluster2(float2* outv, int* outl, float threshold, int window){
    __shared__ int loc[${blocks}];
    __shared__ float val[${blocks}];

    int i = threadIdx.x;

    int l = outl[i];
    loc[i] = l;

    if (l == -1)
        return;

    val[i] = outv[i].x * outv[i].x + outv[i].y * outv[i].y;


    // Check right
    if ( (i < (${blocks} - 1)) && (val[i + 1] > val[i]) ){
        outl[i] = -1;
        return;
    }

    // Check left
    if ( (i > 0) && (val[i - 1] > val[i]) ){
        outl[i] = -1;
        return;
    }
}
""")

[docs] @functools.lru_cache(maxsize=None) def get_tkernel(slen, window): if window < 32: raise ValueError("GPU threshold kernel does not support a window smaller than 32 samples") elif window <= 4096: nt = 128 elif window <= 16384: nt = 256 elif window <= 32768: nt = 512 else: nt = 1024 nb = int(cp.ceil(slen / float(window))) if nb > 1024: raise ValueError("More than 1024 blocks not supported yet") fn = cp.RawKernel( tkernel1.render(chunk=nt), 'threshold_and_cluster', backend='nvcc' ) fn2 = cp.RawKernel( tkernel2.render(blocks=nb), 'threshold_and_cluster2', backend='nvcc' ) return (fn, fn2), nt, nb
[docs] def threshold_and_cluster(series, threshold, window): global val global loc if val is None: val = cp.zeros(4096*256, dtype=cp.complex64) if loc is None: loc = cp.zeros(4096*256, cp.int32) outl = loc outv = val slen = len(series) series = series.data (fn, fn2), nt, nb = get_tkernel(slen, window) threshold = cp.float32(threshold * threshold) window = cp.int32(window) cl = loc[0:nb] cv = val[0:nb] fn((nb,), (nt,), (series.data, outv, outl, window, threshold)) fn2((1,), (nb,), (outv, outl, threshold, window)) w = (cl != -1) return cv[w], cl[w]
[docs] class CUDAThresholdCluster(_BaseThresholdCluster): def __init__(self, series): self.series = series global val global loc if val is None: val = cp.zeros(4096*256, dtype=cp.complex64) if loc is None: loc = cp.zeros(4096*256, cp.int32) self.outl = loc self.outv = val self.slen = len(series)
[docs] def threshold_and_cluster(self, threshold, window): threshold = cp.float32(threshold * threshold) window = cp.int32(window) (fn, fn2), nt, nb = get_tkernel(self.slen, window) cl = loc[0:nb] cv = val[0:nb] fn( (nt, 1, 1), (nb, 1), (self.series.data, self.outv, self.outl, window, threshold) ) fn2( (nb, 1, 1), (1, 1), (self.outv, self.outl, threshold, window) ) w = (cl != -1) return cp.asnumpy(cv[w]), cp.asnumpy(cl[w])
def _threshold_cluster_factory(series): return CUDAThresholdCluster