I am testing 2D convolution using Intel IPP.
I also tested with MKL, but it was not as fast as I expected. However, when I implemented the convolution in a straightforward way with IPP, the output was incorrect.
I would like to obtain the result shown in the link below; please review the code and tell me what is wrong.
const int out_width = 5; const int out_height = 9; IppStatus status = ippStsNoErr; Ipp16s* pSrc1 = NULL, *pSrc2 = NULL, *pDst = NULL; /* Pointers to source/destination images */ int srcStep1 = 0, srcStep2 = 0, dstStep = 0; /* Steps, in bytes, through the source/destination images */ IppiSize dstSize = { out_width, out_height }; /* Size of destination ROI in pixels */ IppiSize src1Size = { 3, 5 }; /* Size of destination ROI in pixels */ IppiSize src2Size = { 3, 5 }; /* Size of destination ROI in pixels */ int divisor = 1; /* The integer value by which the computed result is divided */ Ipp8u *pBuffer = NULL; /* Pointer to the work buffer */ int iTmpBufSize = 0; /* Common work buffer size */ int numChannels = 1; IppEnum funCfgFull = (IppEnum)(ippAlgAuto | ippiROIFull | ippiNormNone); pSrc2 = ippiMalloc_16s_C1(src2Size.width, src2Size.height, &srcStep2); pSrc1 = ippiMalloc_16s_C1(src1Size.width, src1Size.height, &srcStep1); pDst = ippiMalloc_16s_C1(dstSize.width, dstSize.height, &dstStep); do { status = ippiConvGetBufferSize(src1Size, src2Size, ipp16s, numChannels, funCfgFull, &iTmpBufSize); if (ippStsNoErr != status) break; pBuffer = ippsMalloc_8u(iTmpBufSize); pSrc2[0] = pSrc1[0] = 1; pSrc2[1] = pSrc1[1] = 1; pSrc2[2] = pSrc1[2] = 1; pSrc2[3] = pSrc1[3] = 1; pSrc2[4] = pSrc1[4] = 0; pSrc2[5] = pSrc1[5] = 0; pSrc2[6] = pSrc1[6] = 1; pSrc2[7] = pSrc1[7] = 1; pSrc2[8] = pSrc1[8] = 1; pSrc2[9] = pSrc1[9] = 0; pSrc2[10] = pSrc1[10] = 0; pSrc2[11] = pSrc1[11] = 1; pSrc2[12] = pSrc1[12] = 1; pSrc2[13] = pSrc1[13] = 1; pSrc2[14] = pSrc1[14] = 1; status = ippiConv_16s_C1R(pSrc1, srcStep1, src1Size, pSrc2, srcStep2, src2Size, pDst, dstStep, divisor, funCfgFull, pBuffer); if (ippStsNoErr != status) break; for (int j = 0; j < out_height; j++) { for (int i = 0; i < out_width; i++) { cout << pDst[i + j*out_width] << ""; } cout << endl; } } while (false); ippsFree(pBuffer); ippiFree(pSrc1); ippiFree(pSrc2); ippiFree(pDst);