# Source code for pycbc.filter.simd_correlate

# Copyright (C) 2014 Josh Willis
#
# This program is free software; you can redistribute it and/or modify it
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
from __future__ import absolute_import
from pycbc.types import float32, complex64
import numpy as _np
from .. import opt
from .simd_correlate_cython import ccorrf_simd, ccorrf_parallel

"""
This module interfaces to C functions for multiplying
the complex conjugate of one vector by a second vector, writing the output
to a third vector. They do this multi-threaded and with SIMD vectorization.

The code defined here, and the other calling that function,
are imported and used in the CPUCorrelator class defined in
matchedfilter_cpu.py.

Two functions are defined in the 'support' C/Cython module:

ccorrf_simd: Runs on a single core, but vectorized
ccorrf_parallel: Runs multicore, but not explicitly vectorized.
Parallelized using OpenMP, and calls ccorrf_simd
"""

[docs]def correlate_simd(ht, st, qt):
htilde = _np.array(ht.data, copy=False, dtype=float32)
stilde = _np.array(st.data, copy=False, dtype=float32)
qtilde = _np.array(qt.data, copy=False, dtype=float32)
arrlen = len(htilde)
ccorrf_simd(htilde, stilde, qtilde, arrlen)

# We need a segment size (number of complex elements) such that *three* segments
# of that size will fit in the L2 cache. We also want it to be a power of two.
# We are dealing with single-precision complex numbers, which each require 8 bytes.
#
# Our kernel is written to assume a complex correlation of single-precision vectors,
# so that's all we support here.  Note that we are assuming that the correct target
# is that the vectors should fit in L2 cache.  Figuring out cache topology dynamically
# is a harder problem than we attempt to solve here.

if opt.HAVE_GETCONF:
# Since we need 3 vectors fitting in L2 cache, divide by 3
# We find the nearest power-of-two that fits, and the length
# of the single-precision complex array that fits into that size.
pow2 = int(_np.log(opt.LEVEL2_CACHE_SIZE/3.0)/_np.log(2.0))
default_segsize = pow(2, pow2)/_np.dtype(_np.complex64).itemsize
else:
# Seems to work for Sandy Bridge/Ivy Bridge/Haswell, for now?
default_segsize = 8192

[docs]def correlate_parallel(ht, st, qt):
htilde = _np.array(ht.data, copy=False, dtype=complex64)
stilde = _np.array(st.data, copy=False, dtype=complex64)
qtilde = _np.array(qt.data, copy=False, dtype=complex64)
arrlen = len(htilde)
segsize = default_segsize
ccorrf_parallel(htilde, stilde, qtilde, arrlen, segsize)