# Copyright (C) 2014 Josh Willis
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
from __future__ import absolute_import
from pycbc.types import float32, complex64
import numpy as _np
from .. import opt
from .simd_correlate_cython import ccorrf_simd, ccorrf_parallel
This module interfaces to C functions for multiplying
the complex conjugate of one vector by a second vector, writing the output
to a third vector. They do this multi-threaded and with SIMD vectorization.
The code defined here, and the other calling that function,
are imported and used in the CPUCorrelator class defined in
Two functions are defined in the 'support' C/Cython module:
ccorrf_simd: Runs on a single core, but vectorized
ccorrf_parallel: Runs multicore, but not explicitly vectorized.
Parallelized using OpenMP, and calls ccorrf_simd
[docs]def correlate_simd(ht, st, qt):
htilde = _np.array(ht.data, copy=False, dtype=float32)
stilde = _np.array(st.data, copy=False, dtype=float32)
qtilde = _np.array(qt.data, copy=False, dtype=float32)
arrlen = len(htilde)
ccorrf_simd(htilde, stilde, qtilde, arrlen)
# We need a segment size (number of complex elements) such that *three* segments
# of that size will fit in the L2 cache. We also want it to be a power of two.
# We are dealing with single-precision complex numbers, which each require 8 bytes.
# Our kernel is written to assume a complex correlation of single-precision vectors,
# so that's all we support here. Note that we are assuming that the correct target
# is that the vectors should fit in L2 cache. Figuring out cache topology dynamically
# is a harder problem than we attempt to solve here.
# Since we need 3 vectors fitting in L2 cache, divide by 3
# We find the nearest power-of-two that fits, and the length
# of the single-precision complex array that fits into that size.
pow2 = int(_np.log(opt.LEVEL2_CACHE_SIZE/3.0)/_np.log(2.0))
default_segsize = pow(2, pow2)/_np.dtype(_np.complex64).itemsize
# Seems to work for Sandy Bridge/Ivy Bridge/Haswell, for now?
default_segsize = 8192
[docs]def correlate_parallel(ht, st, qt):
htilde = _np.array(ht.data, copy=False, dtype=complex64)
stilde = _np.array(st.data, copy=False, dtype=complex64)
qtilde = _np.array(qt.data, copy=False, dtype=complex64)
arrlen = len(htilde)
segsize = default_segsize
ccorrf_parallel(htilde, stilde, qtilde, arrlen, segsize)