Commit ca97576b authored by Leonardo Solis's avatar Leonardo Solis
Browse files

refactored LS-loops to remove bottleneck with II=15


Former-commit-id: f78cfe4b
parent a8b7ab8b
......@@ -122,11 +122,25 @@ while(valid) {
#endif
mem_fence(CLK_CHANNEL_MEM_FENCE);
// read prngs from prng_kernel using channels.
// this loop and the following one were initially merged for deeper pipelining.
// however, the prng channel read created a bottleneck with II=15.
// the split loops each have II=1.
float float_prng [ACTUAL_GENOTYPE_LENGTH];
for (uchar i=0; i<DockConst_num_of_genes; i++) {
float_prng [i] = read_channel_intel(chan_PRNG2LS_float_prng[0]);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
// new random deviate
// rho is the deviation of the uniform distribution
for (uchar i=0; i<DockConst_num_of_genes; i++) {
/*
float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[0]);
mem_fence(CLK_CHANNEL_MEM_FENCE);
*/
float tmp_prng = float_prng[i];
#if defined (FIXED_POINT_LS1)
fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng;
......@@ -285,9 +299,6 @@ while(valid) {
for (uchar i=0; i<DockConst_num_of_genes; i++) {
if (i == 0) {
float2 evalenergy = {*(float*)&LS_eval, current_energy};
/*
write_channel_intel(chan_LS2GA_LS1_evalenergy, evalenergy);
*/
write_channel_intel(chan_LS2GA_evalenergy[0], evalenergy);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
......
......@@ -124,12 +124,26 @@ while(valid) {
write_channel_intel(chan_LS2Arbiter_end[1], (rho < DockConst_rho_lower_bound)?true:false);
#endif
mem_fence(CLK_CHANNEL_MEM_FENCE);
// read prngs from prng_kernel using channels.
// this loop and the following one were initially merged for deeper pipelining.
// however, the prng channel read created a bottleneck with II=15.
// the split loops each have II=1.
float float_prng [ACTUAL_GENOTYPE_LENGTH];
for (uchar i=0; i<DockConst_num_of_genes; i++) {
float_prng [i] = read_channel_intel(chan_PRNG2LS_float_prng[1]);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
// new random deviate
// rho is the deviation of the uniform distribution
for (uchar i=0; i<DockConst_num_of_genes; i++) {
/*
float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[1]);
mem_fence(CLK_CHANNEL_MEM_FENCE);
*/
float tmp_prng = float_prng[i];
#if defined (FIXED_POINT_LS2)
fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng;
......@@ -292,9 +306,6 @@ while(valid) {
for (uchar i=0; i<DockConst_num_of_genes; i++) {
if (i == 0) {
float2 evalenergy = {*(float*)&LS_eval, current_energy};
/*
write_channel_intel(chan_LS2GA_LS2_evalenergy, evalenergy);
*/
write_channel_intel(chan_LS2GA_evalenergy[1], evalenergy);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
......
......@@ -125,11 +125,25 @@ while(valid) {
#endif
mem_fence(CLK_CHANNEL_MEM_FENCE);
// read prngs from prng_kernel using channels.
// this loop and the following one were initially merged for deeper pipelining.
// however, the prng channel read created a bottleneck with II=15.
// the split loops each have II=1.
float float_prng [ACTUAL_GENOTYPE_LENGTH];
for (uchar i=0; i<DockConst_num_of_genes; i++) {
float_prng [i] = read_channel_intel(chan_PRNG2LS_float_prng[2]);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
// new random deviate
// rho is the deviation of the uniform distribution
for (uchar i=0; i<DockConst_num_of_genes; i++) {
/*
float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[2]);
mem_fence(CLK_CHANNEL_MEM_FENCE);
*/
float tmp_prng = float_prng[i];
#if defined (FIXED_POINT_LS3)
fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng;
......@@ -292,9 +306,6 @@ while(valid) {
for (uchar i=0; i<DockConst_num_of_genes; i++) {
if (i == 0) {
float2 evalenergy = {*(float*)&LS_eval, current_energy};
/*
write_channel_intel(chan_LS2GA_LS3_evalenergy, evalenergy);
*/
write_channel_intel(chan_LS2GA_evalenergy[2], evalenergy);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
......
......@@ -124,12 +124,26 @@ while(valid) {
write_channel_intel(chan_LS2Arbiter_end[3], (rho < DockConst_rho_lower_bound)?true:false);
#endif
mem_fence(CLK_CHANNEL_MEM_FENCE);
// read prngs from prng_kernel using channels.
// this loop and the following one were initially merged for deeper pipelining.
// however, the prng channel read created a bottleneck with II=15.
// the split loops each have II=1.
float float_prng [ACTUAL_GENOTYPE_LENGTH];
for (uchar i=0; i<DockConst_num_of_genes; i++) {
float_prng [i] = read_channel_intel(chan_PRNG2LS_float_prng[3]);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
// new random deviate
// rho is the deviation of the uniform distribution
for (uchar i=0; i<DockConst_num_of_genes; i++) {
/*
float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[3]);
mem_fence(CLK_CHANNEL_MEM_FENCE);
*/
float tmp_prng = float_prng[i];
#if defined (FIXED_POINT_LS4)
fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng;
......@@ -292,9 +306,6 @@ while(valid) {
for (uchar i=0; i<DockConst_num_of_genes; i++) {
if (i == 0) {
float2 evalenergy = {*(float*)&LS_eval, current_energy};
/*
write_channel_intel(chan_LS2GA_LS4_evalenergy, evalenergy);
*/
write_channel_intel(chan_LS2GA_evalenergy[3], evalenergy);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
......
......@@ -124,12 +124,26 @@ while(valid) {
write_channel_intel(chan_LS2Arbiter_end[4], (rho < DockConst_rho_lower_bound)?true:false);
#endif
mem_fence(CLK_CHANNEL_MEM_FENCE);
// read prngs from prng_kernel using channels.
// this loop and the following one were initially merged for deeper pipelining.
// however, the prng channel read created a bottleneck with II=15.
// the split loops each have II=1.
float float_prng [ACTUAL_GENOTYPE_LENGTH];
for (uchar i=0; i<DockConst_num_of_genes; i++) {
float_prng [i] = read_channel_intel(chan_PRNG2LS_float_prng[4]);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
// new random deviate
// rho is the deviation of the uniform distribution
for (uchar i=0; i<DockConst_num_of_genes; i++) {
/*
float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[4]);
mem_fence(CLK_CHANNEL_MEM_FENCE);
*/
float tmp_prng = float_prng[i];
#if defined (FIXED_POINT_LS5)
fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng;
......@@ -292,9 +306,6 @@ while(valid) {
for (uchar i=0; i<DockConst_num_of_genes; i++) {
if (i == 0) {
float2 evalenergy = {*(float*)&LS_eval, current_energy};
/*
write_channel_intel(chan_LS2GA_LS5_evalenergy, evalenergy);
*/
write_channel_intel(chan_LS2GA_evalenergy[4], evalenergy);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
......
......@@ -124,12 +124,26 @@ while(valid) {
write_channel_intel(chan_LS2Arbiter_end[5], (rho < DockConst_rho_lower_bound)?true:false);
#endif
mem_fence(CLK_CHANNEL_MEM_FENCE);
// read prngs from prng_kernel using channels.
// this loop and the following one were initially merged for deeper pipelining.
// however, the prng channel read created a bottleneck with II=15.
// the split loops each have II=1.
float float_prng [ACTUAL_GENOTYPE_LENGTH];
for (uchar i=0; i<DockConst_num_of_genes; i++) {
float_prng [i] = read_channel_intel(chan_PRNG2LS_float_prng[5]);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
// new random deviate
// rho is the deviation of the uniform distribution
for (uchar i=0; i<DockConst_num_of_genes; i++) {
/*
float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[5]);
mem_fence(CLK_CHANNEL_MEM_FENCE);
*/
float tmp_prng = float_prng[i];
#if defined (FIXED_POINT_LS6)
fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng;
......@@ -292,9 +306,6 @@ while(valid) {
for (uchar i=0; i<DockConst_num_of_genes; i++) {
if (i == 0) {
float2 evalenergy = {*(float*)&LS_eval, current_energy};
/*
write_channel_intel(chan_LS2GA_LS6_evalenergy, evalenergy);
*/
write_channel_intel(chan_LS2GA_evalenergy[5], evalenergy);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
......
......@@ -124,12 +124,26 @@ while(valid) {
write_channel_intel(chan_LS2Arbiter_end[6], (rho < DockConst_rho_lower_bound)?true:false);
#endif
mem_fence(CLK_CHANNEL_MEM_FENCE);
// read prngs from prng_kernel using channels.
// this loop and the following one were initially merged for deeper pipelining.
// however, the prng channel read created a bottleneck with II=15.
// the split loops each have II=1.
float float_prng [ACTUAL_GENOTYPE_LENGTH];
for (uchar i=0; i<DockConst_num_of_genes; i++) {
float_prng [i] = read_channel_intel(chan_PRNG2LS_float_prng[6]);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
// new random deviate
// rho is the deviation of the uniform distribution
for (uchar i=0; i<DockConst_num_of_genes; i++) {
/*
float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[6]);
mem_fence(CLK_CHANNEL_MEM_FENCE);
*/
float tmp_prng = float_prng[i];
#if defined (FIXED_POINT_LS7)
fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng;
......@@ -292,9 +306,6 @@ while(valid) {
for (uchar i=0; i<DockConst_num_of_genes; i++) {
if (i == 0) {
float2 evalenergy = {*(float*)&LS_eval, current_energy};
/*
write_channel_intel(chan_LS2GA_LS7_evalenergy, evalenergy);
*/
write_channel_intel(chan_LS2GA_evalenergy[6], evalenergy);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
......
......@@ -124,12 +124,26 @@ while(valid) {
write_channel_intel(chan_LS2Arbiter_end[7], (rho < DockConst_rho_lower_bound)?true:false);
#endif
mem_fence(CLK_CHANNEL_MEM_FENCE);
// read prngs from prng_kernel using channels.
// this loop and the following one were initially merged for deeper pipelining.
// however, the prng channel read created a bottleneck with II=15.
// the split loops each have II=1.
float float_prng [ACTUAL_GENOTYPE_LENGTH];
for (uchar i=0; i<DockConst_num_of_genes; i++) {
float_prng [i] = read_channel_intel(chan_PRNG2LS_float_prng[7]);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
// new random deviate
// rho is the deviation of the uniform distribution
for (uchar i=0; i<DockConst_num_of_genes; i++) {
/*
float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[7]);
mem_fence(CLK_CHANNEL_MEM_FENCE);
*/
float tmp_prng = float_prng[i];
#if defined (FIXED_POINT_LS8)
fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng;
......@@ -292,9 +306,6 @@ while(valid) {
for (uchar i=0; i<DockConst_num_of_genes; i++) {
if (i == 0) {
float2 evalenergy = {*(float*)&LS_eval, current_energy};
/*
write_channel_intel(chan_LS2GA_LS8_evalenergy, evalenergy);
*/
write_channel_intel(chan_LS2GA_evalenergy[7], evalenergy);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
......
......@@ -124,12 +124,26 @@ while(valid) {
write_channel_intel(chan_LS2Arbiter_end[8], (rho < DockConst_rho_lower_bound)?true:false);
#endif
mem_fence(CLK_CHANNEL_MEM_FENCE);
// read prngs from prng_kernel using channels.
// this loop and the following one were initially merged for deeper pipelining.
// however, the prng channel read created a bottleneck with II=15.
// the split loops each have II=1.
float float_prng [ACTUAL_GENOTYPE_LENGTH];
for (uchar i=0; i<DockConst_num_of_genes; i++) {
float_prng [i] = read_channel_intel(chan_PRNG2LS_float_prng[8]);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
// new random deviate
// rho is the deviation of the uniform distribution
for (uchar i=0; i<DockConst_num_of_genes; i++) {
/*
float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[8]);
mem_fence(CLK_CHANNEL_MEM_FENCE);
*/
float tmp_prng = float_prng[i];
#if defined (FIXED_POINT_LS9)
fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng;
......@@ -292,9 +306,6 @@ while(valid) {
for (uchar i=0; i<DockConst_num_of_genes; i++) {
if (i == 0) {
float2 evalenergy = {*(float*)&LS_eval, current_energy};
/*
write_channel_intel(chan_LS2GA_LS9_evalenergy, evalenergy);
*/
write_channel_intel(chan_LS2GA_evalenergy[8], evalenergy);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment