Hello,
I have a problem with the low performance of the IPP library on an SMP multicore system. (I have an i7-4700EQ processor with Hyper-Threading disabled.) I want to use 4 cores in parallel. I compiled the code below and ran it. With only 1 core, the measured time was 394 us. But with 4 cores, I saw about 1191 us on each core. How is this possible? I should have seen about 394 us on every core, right? I used independent tasks and independent memory buffers. Also, why are the values printed after "IPP BUF SIZES:" different? I think these values should be the same.
Code:
/* Input sample buffers: real part and imaginary part. */
float *fInR;
float *fInI;

/* Sizes, data pointers and FFT handles shared between cfrRSIAna and IPPTestFunc. */
InputStruct InputTest;

/* Start (dtime1) and finish (dtime2) timestamps recorded by IPPTestFunc, one slot per core. */
double dtime1[4];
double dtime2[4];

/* Divisor cfrRSIAna applies to the elapsed time when printing the per-core result. */
#define FORLOOP 1000
void cfrRSIAna(int iWeight, int iHeight, int iAffin)
{
ippInit();
int iFFTOrderWeight;
IppsFFTSpec_C_32f *specWeight[4] = {NULL, NULL, NULL, NULL};
Ipp8u* specbufWeight[4] = {NULL, NULL, NULL, NULL};
Ipp8u* specinitWeight[4] = {NULL, NULL, NULL, NULL};
Ipp8u* workbufWeight[4] = {NULL, NULL, NULL, NULL};
int iSpecSizeWeight[4]={0,0,0,0}, iSpecInitSizeWeight[4]={0,0,0,0}, iWorkBufSizeWeight[4]={0,0,0,0};
TASK_ID tids[4]; /* some task IDs */
char taskIsmi[32];
int i, cpuIx[] = {0,1,2,3}; /* core ID'ler*/
phys_cpuset_t affinity;
libinfo(); /* ipp version */
InputTest.iWeight = iWeight;
InputTest.iHeight = iHeight;
strcpy(cXmlDosyaAdi, cCfrRSIXml);/* aksi takdirde, caller dosya adini free ederse yanlis deger okunabilir */
fInR = memalign(128, 4*1024*1024*4);
fInI = memalign(128, 4*1024*1024*4);
fOutR = memalign(128, 4*1024*1024*4);
fOutI = memalign(128, 4*1024*1024*4);
for(i=0; i<iAffin*iWeight*iHeight; i++)
{
fInR[i] = rand()/(float)rand();
fInI[i] = rand()*1.315/(float)rand();
}
InputTest.fpInputR = fInR;
InputTest.fpInputI = fInI;
InputTest.fpOutputR = fOutR;
InputTest.fpOutputI = fOutI;
iFFTOrderWeight = (int)(LOG2_X(iWeight));
printf("fftOrder:%d\n", iFFTOrderWeight);
for(i=0; i<iAffin; i++)
{
ippsFFTGetSize_C_32f(iFFTOrderWeight, IPP_FFT_NODIV_BY_ANY, ippAlgHintNone, &iSpecSizeWeight[i], &iSpecInitSizeWeight[i], &iWorkBufSizeWeight[i]);
if(iSpecSizeWeight[i]) {specbufWeight[i] = ippsMalloc_8u(iSpecSizeWeight[i]);}
if(iSpecInitSizeWeight[i]) {specinitWeight[i] = ippsMalloc_8u(iSpecInitSizeWeight[i]);}
if(iWorkBufSizeWeight[i]) {workbufWeight[i] = ippsMalloc_8u(iWorkBufSizeWeight[i]);}
ippsFFTInit_C_32f(&specWeight[i], iFFTOrderWeight, IPP_FFT_NODIV_BY_ANY, ippAlgHintNone, specbufWeight[i], specinitWeight[i]);
InputTest.specWeight[i] = specWeight[i];
InputTest.workbufWeight[i] = workbufWeight[i];
printf("IPP BUF SIZES: specSize:%d InitSize:%d WorkSize:%d specbuffDop:%d workbuffDopp:%d\n", iSpecSizeWeight[i], iSpecInitSizeWeight[i], iWorkBufSizeWeight[i],specWeight[i],workbufWeight[i]);
}
/*************************************************************************/
printf("Cores setting...\n");
for(i=0; i<iAffin; i++)
{
PHYS_CPUSET_ZERO(affinity);
PHYS_CPUSET_SET(affinity, cpuIx[i]);
sprintf(taskIsmi, "t%s%d", "testIPP", i);
tids[i] = taskCreate(taskIsmi, 120, TASK_OPTIONS, 65536, (FUNCPTR)IPPTestFunc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
printf("Task create edildi:0x%08x\n", tids[i]);
if (tids[i] == NULL)
{
/*return (ERROR);*/
printf("Task create hatasi:0x%08x\n", tids[i]);
}
if(iAffin != -1)
{
printf("Kontrol %d\n", i);
/* Clear the affinity CPU set and set index for CPU */
if (taskCpuAffinitySet(tids[i], affinity) == ERROR)
{
/* Either CPUs are not enabled or we are in UP mode */
printf("Affinity set edilemedi !!!test_tap\n");
taskDelete(tids[i]);
/*return (ERROR);*/
}
taskDelay(sysClkRateGet()/10);
taskCpuAffinityGet(tids[i], &affinity);
printf("Task Affinity:%d\n", affinity);
}
}
for(i=0; i<iAffin; i++)
{
taskActivate(tids[i]);
}
taskDelay(sysClkRateGet()* 4); /* for finish all cores.*/
for(i=0; i<iAffin; i++)
{
printf("\nStartTime[%d]=%f FinishTime[%d]=%f ExecutionTimeForCore[%d]=%f us\n", i, dtime1[i], i, dtime2[i], i, (dtime2[i]-dtime1[i])/FORLOOP);
}
for(i=0; i<iAffin; i++)
{
taskDelete(tids[i]);
}
for(i=0; i<iAffin; i++)
{
ippsFFTFree_C_32f(specWeight[i]);
ippFree(specbufWeight[i]);
ippFree(specinitWeight[i]);
ippFree(workbufWeight[i]);
}
/* mem free*/
}
void IPPTestFunc(void)
{
ippInit();
int iCpuId = vxCpuPhysIndexGet();
IppsFFTSpec_C_32f *specDop = NULL;
unsigned char *workbufDop = NULL;
int iWeight, iHeight, i;
float *fInReal, *fInImag, *fOutReal, *fOutImag;
iWeight = InputTest->iWeight;
iHeight = InputTest->iHeight;
fInReal = InputTest->fpInputR+iCpuId*iWeight*Height;
fInImag = InputTest->fpInputI+iCpuId*iWeight*Height;
fOutReal = InputTest->fpOutputR+iCpuId*iWeight*Height;
fOutImag = InputTest->fpOutputI+iCpuId*iWeight*Height;
specDop = InputTest->specDoppler[iCpuId];
workbufDop = InputTest->workbufDoppler[iCpuId];
dtime1[iCpuId] = getTimeDouble(2);
for(i=0; i<iHeight; i++)
{
ippsFFTFwd_CToC_32f(fInReal+i*iWeight, fInImag+i*iWeight, fOutReal+i*iWeight, fOutImag+i*iWeight, specDop, workbufDop);
}
dtime2[iCpuId] = getTimeDouble(2);
}
Console output:
value = IPP Versiyon---> -140737483331680ippSP AVX2 (l9) = 0xffff8000004ca7a0 8.1.0 (r41883) 8.1.0.41883
fftOrder:7
IPP BUF SIZES: specSize:1536 InitSize:0 WorkSize:1088 specbuffDop:859210432 workbuffDopp:5102080
IPP BUF SIZES: specSize:1536 InitSize:0 WorkSize:1088 specbuffDop:859825472 workbuffDopp:859827072
IPP BUF SIZES: specSize:1536 InitSize:0 WorkSize:1088 specbuffDop:252225216 workbuffDopp:252226816
IPP BUF SIZES: specSize:1536 InitSize:0 WorkSize:1088 specbuffDop:252228032 workbuffDopp:252229632
Cores setting...
Task create edildi:0x0f034350
Kontrol 0
Task Affinity:1
Task create edildi:0x333279b0
Kontrol 1
Task Affinity:2
Task create edildi:0x0f081790
Kontrol 2
Task Affinity:4
Task create edildi:0x0f0b0010
Kontrol 3
Task Affinity:8
StartTime[0]=1033823205.286667 FinishTime[0]=1034757013.596667 ExecutionTimeForCore[0]=933.808310 us
StartTime[1]=1033823224.323333 FinishTime[1]=1035062267.645000 ExecutionTimeForCore[1]=1239.043322 us
StartTime[2]=1033823204.383333 FinishTime[2]=1035014816.198333 ExecutionTimeForCore[2]=1191.611815 us
StartTime[3]=1033823206.416667 FinishTime[3]=1034973920.471667 ExecutionTimeForCore[3]=1150.714055 us
value = IPP Versiyon---> -140737483331680ippSP AVX2 (l9) = 0xffff8000004ca7a0 8.1.0 (r41883) 8.1.0.41883
fftOrder:7
IPP BUF SIZES: specSize:1536 InitSize:0 WorkSize:1088 specbuffDop:5120064 workbuffDopp:251123968
Cores setting...
Task create edildi:0x0ef7dd90
Kontrol 0
Task Affinity:1
StartTime[0]=946258083.425000 FinishTime[0]=946652897.046667 ExecutionTimeForCore[0]=394.813622 us