IPP FFT called from C# does not scale with additional CPU cores

Cross-posting from http://stackoverflow.com/q/15836542/90475

Here's my code... I intend to run IPP FFT over millions of 1D traces by calling Correlate(): **performance matters**.

In the Visual Studio profiler, I can see the IPP DFT calls taking the lion's share of the time when running with a low number of processes.

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;

    using ipp;

    namespace GregsFavNamespace
    {
       /// <summary>
       /// Correlates pairs of traces against a fixed sweep using Intel IPP complex-to-complex DFTs.
       /// The conjugated sweep spectrum and every scratch buffer are created once in the constructor
       /// and reused by each <see cref="Correlate"/> call.
       /// </summary>
       /// <remarks>
       /// A single instance must not be used from several threads at once: every call reads and
       /// writes the same preallocated buffers.
       /// NOTE(review): the DFT spec and work buffers are managed arrays that are pinned only for the
       /// duration of each native call. Confirm against the IPP documentation that an initialized
       /// spec may be relocated by the GC between calls and that the default managed alignment is
       /// acceptable; otherwise pin the buffers for the lifetime of the object.
       /// </remarks>
       public class Correlation
       {
          readonly int _traceSampleCount, _sizeDftSpec, _sizeDftInitBuf, _sizeDftWorkBuf;
          readonly byte[] _dftSpecBuf, _workBuffer;

          readonly Ipp32fc[] _sweep_spectral_conjugate, _interleavedData_temporal, _interleavedData_spectral, _interleavedDataWithSweep_spectral, _interleavedData_temporalResult;

          /// <summary>
          /// Initializes the IPP DFT for <paramref name="traceSampleCount"/> points, precomputes the
          /// conjugated spectrum of the sweep and allocates the per-call scratch buffers.
          /// </summary>
          /// <param name="sweep">Sweep samples (time domain).</param>
          /// <param name="sweepSampleCount">Number of samples of <paramref name="sweep"/> to use.</param>
          /// <param name="traceSampleCount">Number of samples per trace; also the DFT length.</param>
          /// <param name="shouldNormalizeData">Accepted for API compatibility; not used by this class.</param>
          /// <param name="sweepDelay">
          /// Shift applied to the sweep before transforming it. A positive value drops that many leading
          /// sweep samples; a negative value moves the sweep later in the buffer by that many samples.
          /// </param>
          /// <exception cref="ArgumentNullException"><paramref name="sweep"/> is null.</exception>
          /// <exception cref="ApplicationException">An IPP call reported a status other than ippStsNoErr.</exception>
          public Correlation(float[] sweep, int sweepSampleCount, int traceSampleCount, bool shouldNormalizeData = false, int sweepDelay = 0)
          {
             // A null sweep would otherwise be silently treated as an all-zero sweep when the delay is 0.
             if (sweep == null)
                throw new ArgumentNullException("sweep");

             _traceSampleCount = traceSampleCount;

             // Greg Chernis: the following initialization code is based on
             //    http://software.intel.com/en-us/articles/how-to-use-intel-ipp-s-1d-fouri...

             int ippDivisionAlgorithm = (int)FftDivisionFlag.IPP_FFT_DIV_INV_BY_N; // alternatively, use IPP_FFT_NODIV_BY_ANY
             IppHintAlgorithm ippPerformanceHint = IppHintAlgorithm.ippAlgHintAccurate; // alternatively, use ippAlgHintFast

             // Ask IPP how large the spec, the one-off init buffer and the work buffer must be.
             unsafe
             {
                IppStatus result;
                fixed (int* p_sizeDftSpec = &_sizeDftSpec, p_sizeDftInitBuf = &_sizeDftInitBuf, p_sizeDftWorkBuf = &_sizeDftWorkBuf)
                {
                   result = sp.ippsDFTGetSize_C_32fc(traceSampleCount, ippDivisionAlgorithm, ippPerformanceHint,
                      p_sizeDftSpec, p_sizeDftInitBuf, p_sizeDftWorkBuf);
                }

                CheckStatus(result);
             }

             byte[] dftInitBuf = new byte[_sizeDftInitBuf];
             _dftSpecBuf = new byte[_sizeDftSpec];
             _workBuffer = new byte[_sizeDftWorkBuf];

             // Initialize the DFT spec in place inside _dftSpecBuf; dftInitBuf is only needed here.
             unsafe
             {
                IppStatus result;
                fixed (byte* p_dftInitBuf = dftInitBuf)
                fixed (byte* p_dftSpecBuf = _dftSpecBuf)
                {
                   var p_dftSpec = (IppsDFTSpec_C_32fc*)p_dftSpecBuf;
                   result = sp.ippsDFTInit_C_32fc(traceSampleCount, ippDivisionAlgorithm, ippPerformanceHint,
                      p_dftSpec, p_dftInitBuf);
                }

                CheckStatus(result);
             }

             // Compute this sweep transformation once and re-use it as many times as we have traces to process
             _sweep_spectral_conjugate = PrepareSweepForCorrelation(sweep, sweepSampleCount, sweepDelay);

             // Pre-allocate these buffers, as they are frequently re-used: as many times as we have traces to process
             _interleavedData_temporal          = new Ipp32fc[traceSampleCount];
             _interleavedData_spectral          = new Ipp32fc[traceSampleCount];
             _interleavedDataWithSweep_spectral = new Ipp32fc[traceSampleCount];
             _interleavedData_temporalResult    = new Ipp32fc[traceSampleCount];
          }

          /// <summary>
          /// Throws when an IPP call did not succeed, using the status name as the message
          /// (the same convention for every IPP call made by this class).
          /// </summary>
          /// <exception cref="ApplicationException"><paramref name="status"/> is not ippStsNoErr.</exception>
          private static void CheckStatus(IppStatus status)
          {
             if (status != IppStatus.ippStsNoErr)
                throw new ApplicationException(status.ToString());
          }

          /// <summary>
          /// Returns 1 / sqrt(sum of squares) over the first <paramref name="traceSampleCount"/> samples,
          /// or 1 when that sum is exactly zero. Not called anywhere in this class.
          /// </summary>
          private static float CalculateNormalizationFactor(float[] trace1, int traceSampleCount)
          {
             double normalizationFactor = 0.0;
             for (uint i = 0; i < traceSampleCount; i++)
                normalizationFactor += trace1[i] * trace1[i];

             // An all-zero trace has no energy to normalize by; exact comparison is intentional.
             if (normalizationFactor == 0.0)
                return 1.0f;

             return (float)(1.0 / Math.Sqrt(normalizationFactor));
          }

          /// <summary>
          /// Places the (optionally shifted) sweep in a zero-initialized buffer of trace length and
          /// returns the complex conjugate of its spectrum.
          /// </summary>
          private Ipp32fc[] PrepareSweepForCorrelation(float[] sweep, int sweepSampleCount, int sweepDelay)
          {
             var sweep_temporal = new Ipp32fc[_traceSampleCount];
             ApplySweepDelay(sweep, sweep_temporal, sweepSampleCount, sweepDelay);
             return ComputeConjugateOfSweepSpectra(sweep_temporal, _traceSampleCount);
          }

          /// <summary>
          /// Copies the sweep into the real parts of <paramref name="sweepDst"/>, shifted by
          /// <paramref name="delay"/> samples; the imaginary parts of the written elements are set to 0.
          /// </summary>
          /// <remarks>
          /// delay &gt; 0: the first <paramref name="delay"/> sweep samples are skipped and the remainder
          /// is written from index 0. delay &lt; 0: the sweep is written starting at index -delay, and
          /// samples that would fall past the end of <paramref name="sweepDst"/> are dropped.
          /// Elements of <paramref name="sweepDst"/> that are not written keep their current values.
          /// </remarks>
          private static void ApplySweepDelay(float[] sweepSrc, Ipp32fc[] sweepDst, int sweepSampleCount, int delay)
          {
             int srcOffset = Math.Max(delay, 0);
             int dstOffset = Math.Max(-delay, 0);

             int count = sweepSampleCount - srcOffset;
             if (dstOffset > 0)
                count = Math.Min(count, sweepDst.Length - dstOffset);

             // Write straight into sweepDst: going through Skip(...).ToArray() on the destination
             // would fill a temporary copy and leave the real buffer untouched.
             for (int i = 0; i < count; i++)
             {
                sweepDst[dstOffset + i].re = sweepSrc[srcOffset + i];
                sweepDst[dstOffset + i].im = 0;
             }
          }

          /// <summary>
          /// Forward-transforms <paramref name="sweep_temporal"/> and returns the complex conjugate of
          /// the resulting spectrum in a newly allocated array.
          /// </summary>
          /// <exception cref="ApplicationException">An IPP call reported a status other than ippStsNoErr.</exception>
          private unsafe Ipp32fc[] ComputeConjugateOfSweepSpectra(Ipp32fc[] sweep_temporal, int sweepSampleCount)
          {
             var sweep_spectral = new Ipp32fc[sweepSampleCount];
             var sweep_spectral_conjugate = new Ipp32fc[sweepSampleCount];

             fixed (Ipp32fc*
                p_sweep_temporal = sweep_temporal,
                p_sweep_spectral = sweep_spectral,
                p_sweep_spectral_conjugate = sweep_spectral_conjugate)
             {
                fixed (byte* p_workBuffer = _workBuffer)
                fixed (byte* p_dftSpecBuf = _dftSpecBuf)
                {
                   var p_dftSpec = (IppsDFTSpec_C_32fc*)p_dftSpecBuf;
                   // Forward Fourier to spectra domain
                   CheckStatus(sp.ippsDFTFwd_CToC_32fc(p_sweep_temporal, p_sweep_spectral, p_dftSpec, p_workBuffer));
                }

                // Conjugate
                CheckStatus(sp.ippsConj_32fc(p_sweep_spectral, p_sweep_spectral_conjugate, sweepSampleCount));
             }

             return sweep_spectral_conjugate;
          }

          /// <summary>
          /// Correlates two traces with the sweep in a single complex DFT round trip: trace 1 travels
          /// in the real parts and trace 2 in the imaginary parts.
          /// </summary>
          /// <param name="trace1">First input trace, or null to feed zeros.</param>
          /// <param name="trace2">Second input trace, or null to feed zeros.</param>
          /// <param name="trace1Result">Receives the real parts of the result; may be null to skip.</param>
          /// <param name="trace2Result">Receives the imaginary parts of the result; may be null to skip.</param>
          /// <exception cref="ApplicationException">An IPP call reported a status other than ippStsNoErr.</exception>
          public void Correlate(float[] trace1, float[] trace2, float[] trace1Result, float[] trace2Result)
          {
             // Blend traces 1 and 2 into _interleavedData_temporal (array of complex numbers for Complex-to-Complex Fourier)
             interleave4(trace1, trace2, _interleavedData_temporal, _traceSampleCount);

             CorrelateBothTraces();

             // Get back correlated traces 1 and 2 from _interleavedData_temporalResult (output of the inverse DFT)
             uninterleave4(_interleavedData_temporalResult, trace1Result, trace2Result, _traceSampleCount);
          }

          /// <summary>
          /// Runs forward DFT, multiplication by the conjugated sweep spectrum and inverse DFT on the
          /// preallocated buffers, from _interleavedData_temporal to _interleavedData_temporalResult.
          /// </summary>
          /// <exception cref="ApplicationException">An IPP call reported a status other than ippStsNoErr.</exception>
          unsafe private void CorrelateBothTraces()
          {
             // GC: performance is critical for the following three unmanaged calls:
             //   all arrays should be preallocated upfront

             fixed (Ipp32fc* p_sweep_spectral_conjugate = _sweep_spectral_conjugate,
                p_interleavedData_temporal = _interleavedData_temporal,
                p_interleavedData_spectral = _interleavedData_spectral,
                p_interleavedDataWithSweep_spectral = _interleavedDataWithSweep_spectral,
                p_interleavedData_temporalResult = _interleavedData_temporalResult)
             fixed (byte* p_workBuffer = _workBuffer, p_dftSpecBuf = _dftSpecBuf)
             {
                var p_dftSpec = (IppsDFTSpec_C_32fc*)p_dftSpecBuf;

                // Forward Fourier to spectra domain
                CheckStatus(sp.ippsDFTFwd_CToC_32fc(p_interleavedData_temporal, p_interleavedData_spectral, p_dftSpec, p_workBuffer));

                // Complex multiply conjugate of sweep spectra by blended trace data spectra
                CheckStatus(sp.ippsMul_32fc(p_sweep_spectral_conjugate, p_interleavedData_spectral, p_interleavedDataWithSweep_spectral, _traceSampleCount));

                // Inverse Fourier to time domain, correlated
                CheckStatus(sp.ippsDFTInv_CToC_32fc(p_interleavedDataWithSweep_spectral, p_interleavedData_temporalResult, p_dftSpec, p_workBuffer));
             }
          }

          /// <summary>
          /// Fills the first <paramref name="sampleCount"/> elements of <paramref name="complexOutput"/>
          /// from separate real and imaginary arrays. A null input array contributes zeros; when both
          /// inputs are null the output is left untouched.
          /// </summary>
          static void interleave4(float[] reals, float[] imaginaries, Ipp32fc[] complexOutput, int sampleCount)
          {
             int i = 0;

             if (reals != null && imaginaries != null)
             {
                for (i = 0; i < sampleCount; i++)
                {
                   complexOutput[i].re = reals[i];
                   complexOutput[i].im = imaginaries[i];
                }
             }
             else if (reals != null)
             {
                for (i = 0; i < sampleCount; ++i)
                {
                   complexOutput[i].re = reals[i];
                   complexOutput[i].im = 0;
                }
             }
             else if (imaginaries != null)
             {
                for (i = 0; i < sampleCount; ++i)
                {
                   complexOutput[i].re = 0;
                   complexOutput[i].im = imaginaries[i];
                }
             }
          }

          /// <summary>
          /// Splits the first <paramref name="sampleCount"/> elements of <paramref name="complex"/> into
          /// separate real and imaginary arrays. A null output array is skipped.
          /// </summary>
          static void uninterleave4(Ipp32fc[] complex, float[] realsOut, float[] imaginariesOut, int sampleCount)
          {
             if (realsOut != null && imaginariesOut != null)
             {
                for (int i = 0; i < sampleCount; i++)
                {
                   realsOut[i] = complex[i].re;
                   imaginariesOut[i] = complex[i].im;
                }
             }
             else if (realsOut != null)
             {
                for (int i = 0; i < sampleCount; i++)
                   realsOut[i] = complex[i].re;
             }
             else if (imaginariesOut != null)
             {
                for (int i = 0; i < sampleCount; i++)
                   imaginariesOut[i] = complex[i].im;
             }
          }
       }
    }

In Intel's IPP samples, there are generated P/Invoke signatures that I use to call the code from C# (I modified them slightly, but they still work):

    using System;
    using System.Security;
    using System.Runtime.InteropServices;

    namespace ipp {
       // Excerpt of the P/Invoke declarations used by the correlation code.
       // NOTE(review): callers invoke these functions as sp.* and the DllImport attributes refer to
       // ipp.sp.libname, so in the full file the extern declarations presumably sit inside an unsafe
       // class named sp that defines libname. That wrapper, the IppStatus members, IppHintAlgorithm
       // and IppsDFTSpec_C_32fc are not part of this excerpt.
       public enum IppStatus { }

       // Single-precision complex sample: two consecutive floats, real part first.
       [StructLayout(LayoutKind.Sequential, CharSet = CharSet.Ansi)]
       [System.Diagnostics.DebuggerDisplay("({re},{im})")]
       public struct Ipp32fc
       {
          public float re;
          public float im;
          public Ipp32fc(float re, float im)
          {
             this.re = re;
             this.im = im;
          }
       };

       // Reports the sizes of the spec, the init buffer and the work buffer for a DFT of the given length.
       [SuppressUnmanagedCodeSecurityAttribute()]
       [DllImport(ipp.sp.libname)]
       public static extern
          IppStatus ippsDFTGetSize_C_32fc(int length, int flag, IppHintAlgorithm hint, int* pSpecSize, int* pSpecBufferSize, int* pBufferSize);

       // Initializes the DFT spec in caller-provided memory.
       [SuppressUnmanagedCodeSecurityAttribute()]
       [DllImport(ipp.sp.libname)]
       public static extern
          IppStatus ippsDFTInit_C_32fc(int length, int flag, IppHintAlgorithm hint, IppsDFTSpec_C_32fc* pSpec, byte* pSpecBuffer);

       // Forward complex-to-complex DFT.
       [SuppressUnmanagedCodeSecurityAttribute()]
       [DllImport(ipp.sp.libname)] public static extern
       IppStatus ippsDFTFwd_CToC_32fc ( Ipp32fc *pSrc, Ipp32fc *pDst, IppsDFTSpec_C_32fc *pDFTSpec, byte *pBuffer );

       // Complex conjugate of a vector (called by the correlation code; the excerpt previously
       // repeated the forward-DFT declaration here instead).
       [SuppressUnmanagedCodeSecurityAttribute()]
       [DllImport(ipp.sp.libname)] public static extern
       IppStatus ippsConj_32fc ( Ipp32fc *pSrc, Ipp32fc *pDst, int len );

       // Element-wise complex multiplication of two vectors.
       [SuppressUnmanagedCodeSecurityAttribute()]
       [DllImport(ipp.sp.libname)] public static extern
       IppStatus ippsMul_32fc ( Ipp32fc *pSrc1, Ipp32fc *pSrc2, Ipp32fc *pDst, int len );

       // Inverse complex-to-complex DFT.
       [SuppressUnmanagedCodeSecurityAttribute()]
       [DllImport(ipp.sp.libname)] public static extern
       IppStatus ippsDFTInv_CToC_32fc ( Ipp32fc *pSrc, Ipp32fc *pDst, IppsDFTSpec_C_32fc *pDFTSpec, byte *pBuffer );
    }

Here are my run times with a small dataset (keeping the trace size such that working set is 128K):

    Process count | CPU utilization | Wall time to complete (seconds)
    2             | 10              | 90.8
    4             | 20              | 48.6
    6             | 25              | 37.1
    8             | 30              | 31.1
    10            | 40              | 28.6
    12            | 50              | 27.0 <-- perf should not improve past this point due to 12 floating point units in this machine, but IMHO it should scale better
    14            | 55              | 27.0
    16            | 60              | 27.6

.NET GC load is reasonable (less than 10% always, 0-1% most of the time).

I see a lot of calls to IppZero_8u at the top of the call stack as it runs.

I see a lot of [Transition from Managed To Unmanaged]: I plan to switch to C++/CLI in hopes of addressing this issue.

I suspect that IppZero_8u hinders scalability, as it (possibly) accesses main memory and has to synchronize between cores.

How can I pinpoint what prevents scaling? What would you recommend to improve scalability of the algorithm?

----------

Details

- This was performed on dual Xeon 5650 with 6 cores each (6 floating point processors x 2). Intel IPP library correctly detects SSE 4.2 on this processor configuration

- This is an asymmetric machine with two memory nodes, 96GB total DDR3 RAM

- 32K L1 and 256K L2, 12 MB shared L3

- I have tried in-place FFT instead of out-of-place DFT. It scales even less, taking the same amount of time with 6 processes as it does with 2: 40 seconds

- I/O is not a bottleneck, since the same work finishes in 6 seconds without involving correlation

- Intel Integrated Performance Primitives 7.1

- .NET 4.5 is running with Server GC enabled

- I prefer to do this high-level orchestration from a managed language to avoid instability due to memory buffer overruns (fear of clumsy programming and insufficient testing)

IPP FFT called from C# does not scale with additional CPU cores

Trending Articles

Practice Sheet of Right form of verbs for HSC Students

Download: FK ft Shenky – Nakuyewa ”Prod by: Shenky”

How to win at Markstrat (Markstrat Tips and Tricks) – Vodites

Ominde Commission Report and Recommendations – Ominde Report of 1964

Bureau of Internal Revenue: Regional Offices (Directory)

GO 53 on Enhancement of Ex-gratia upto 5 Lakhs Toddy Tappers in Telangana

Cakewalk CA-2A Leveling Amplifier v2.0.1.97 WiN, v2.0.1.96 OSX Incl Keygen

Mp3 Download: Mdu - Kunjenjenjena

How the kill the job , when DTP request running for long hours.

Microsoft Intune から展開しているアプリのアップデートについて

18-year-old girl was beaten for half an hour by two Northampton men in 'an...

Car crash in Dunton Bassett leaves driver in critical condition

Macky 2, Two Others In Road Accident

Application log 00000000000000089514: Could not convert queue DLVST90CLNT

Detroit mafia: D’Anna Brothers agree to plea deal

Delivery block field greyed out using VA02

Muloraki Au

【個人撮影】スマホのプライベート映像♪「中に出さないで///」カラオケ屋での生ハメ撮りが流出ｗ【リベンジポルノ】＠PornHub

BREAKING NEWS: Diamond Platnumz Is Reported Dead After Ghastly Car Accident

FIAT 500 B0111 B0112