Commit ca97576b authored by Leonardo Solis's avatar Leonardo Solis
Browse files

refactored LS-loops to remove bottleneck with II=15


Former-commit-id: f78cfe4b
parent a8b7ab8b
...@@ -122,11 +122,25 @@ while(valid) { ...@@ -122,11 +122,25 @@ while(valid) {
#endif #endif
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
// read PRNG values from prng_kernel using channels.
// this loop and the following one were initially merged for deeper pipelining.
// however, the PRNG channel read created a bottleneck (II=15).
// split into two separate loops, each one achieves II=1.
float float_prng [ACTUAL_GENOTYPE_LENGTH];
for (uchar i=0; i<DockConst_num_of_genes; i++) {
float_prng [i] = read_channel_intel(chan_PRNG2LS_float_prng[0]);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
// new random deviate // new random deviate
// rho is the deviation of the uniform distribution // rho is the deviation of the uniform distribution
for (uchar i=0; i<DockConst_num_of_genes; i++) { for (uchar i=0; i<DockConst_num_of_genes; i++) {
/*
float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[0]); float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[0]);
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
*/
float tmp_prng = float_prng[i];
#if defined (FIXED_POINT_LS1) #if defined (FIXED_POINT_LS1)
fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng; fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng;
...@@ -285,9 +299,6 @@ while(valid) { ...@@ -285,9 +299,6 @@ while(valid) {
for (uchar i=0; i<DockConst_num_of_genes; i++) { for (uchar i=0; i<DockConst_num_of_genes; i++) {
if (i == 0) { if (i == 0) {
float2 evalenergy = {*(float*)&LS_eval, current_energy}; float2 evalenergy = {*(float*)&LS_eval, current_energy};
/*
write_channel_intel(chan_LS2GA_LS1_evalenergy, evalenergy);
*/
write_channel_intel(chan_LS2GA_evalenergy[0], evalenergy); write_channel_intel(chan_LS2GA_evalenergy[0], evalenergy);
} }
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
......
...@@ -124,12 +124,26 @@ while(valid) { ...@@ -124,12 +124,26 @@ while(valid) {
write_channel_intel(chan_LS2Arbiter_end[1], (rho < DockConst_rho_lower_bound)?true:false); write_channel_intel(chan_LS2Arbiter_end[1], (rho < DockConst_rho_lower_bound)?true:false);
#endif #endif
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
// read PRNG values from prng_kernel using channels.
// this loop and the following one were initially merged for deeper pipelining.
// however, the PRNG channel read created a bottleneck (II=15).
// split into two separate loops, each one achieves II=1.
float float_prng [ACTUAL_GENOTYPE_LENGTH];
for (uchar i=0; i<DockConst_num_of_genes; i++) {
float_prng [i] = read_channel_intel(chan_PRNG2LS_float_prng[1]);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
// new random deviate // new random deviate
// rho is the deviation of the uniform distribution // rho is the deviation of the uniform distribution
for (uchar i=0; i<DockConst_num_of_genes; i++) { for (uchar i=0; i<DockConst_num_of_genes; i++) {
/*
float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[1]); float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[1]);
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
*/
float tmp_prng = float_prng[i];
#if defined (FIXED_POINT_LS2) #if defined (FIXED_POINT_LS2)
fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng; fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng;
...@@ -292,9 +306,6 @@ while(valid) { ...@@ -292,9 +306,6 @@ while(valid) {
for (uchar i=0; i<DockConst_num_of_genes; i++) { for (uchar i=0; i<DockConst_num_of_genes; i++) {
if (i == 0) { if (i == 0) {
float2 evalenergy = {*(float*)&LS_eval, current_energy}; float2 evalenergy = {*(float*)&LS_eval, current_energy};
/*
write_channel_intel(chan_LS2GA_LS2_evalenergy, evalenergy);
*/
write_channel_intel(chan_LS2GA_evalenergy[1], evalenergy); write_channel_intel(chan_LS2GA_evalenergy[1], evalenergy);
} }
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
......
...@@ -125,11 +125,25 @@ while(valid) { ...@@ -125,11 +125,25 @@ while(valid) {
#endif #endif
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
// read PRNG values from prng_kernel using channels.
// this loop and the following one were initially merged for deeper pipelining.
// however, the PRNG channel read created a bottleneck (II=15).
// split into two separate loops, each one achieves II=1.
float float_prng [ACTUAL_GENOTYPE_LENGTH];
for (uchar i=0; i<DockConst_num_of_genes; i++) {
float_prng [i] = read_channel_intel(chan_PRNG2LS_float_prng[2]);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
// new random deviate // new random deviate
// rho is the deviation of the uniform distribution // rho is the deviation of the uniform distribution
for (uchar i=0; i<DockConst_num_of_genes; i++) { for (uchar i=0; i<DockConst_num_of_genes; i++) {
/*
float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[2]); float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[2]);
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
*/
float tmp_prng = float_prng[i];
#if defined (FIXED_POINT_LS3) #if defined (FIXED_POINT_LS3)
fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng; fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng;
...@@ -292,9 +306,6 @@ while(valid) { ...@@ -292,9 +306,6 @@ while(valid) {
for (uchar i=0; i<DockConst_num_of_genes; i++) { for (uchar i=0; i<DockConst_num_of_genes; i++) {
if (i == 0) { if (i == 0) {
float2 evalenergy = {*(float*)&LS_eval, current_energy}; float2 evalenergy = {*(float*)&LS_eval, current_energy};
/*
write_channel_intel(chan_LS2GA_LS3_evalenergy, evalenergy);
*/
write_channel_intel(chan_LS2GA_evalenergy[2], evalenergy); write_channel_intel(chan_LS2GA_evalenergy[2], evalenergy);
} }
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
......
...@@ -124,12 +124,26 @@ while(valid) { ...@@ -124,12 +124,26 @@ while(valid) {
write_channel_intel(chan_LS2Arbiter_end[3], (rho < DockConst_rho_lower_bound)?true:false); write_channel_intel(chan_LS2Arbiter_end[3], (rho < DockConst_rho_lower_bound)?true:false);
#endif #endif
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
// read PRNG values from prng_kernel using channels.
// this loop and the following one were initially merged for deeper pipelining.
// however, the PRNG channel read created a bottleneck (II=15).
// split into two separate loops, each one achieves II=1.
float float_prng [ACTUAL_GENOTYPE_LENGTH];
for (uchar i=0; i<DockConst_num_of_genes; i++) {
float_prng [i] = read_channel_intel(chan_PRNG2LS_float_prng[3]);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
// new random deviate // new random deviate
// rho is the deviation of the uniform distribution // rho is the deviation of the uniform distribution
for (uchar i=0; i<DockConst_num_of_genes; i++) { for (uchar i=0; i<DockConst_num_of_genes; i++) {
/*
float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[3]); float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[3]);
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
*/
float tmp_prng = float_prng[i];
#if defined (FIXED_POINT_LS4) #if defined (FIXED_POINT_LS4)
fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng; fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng;
...@@ -292,9 +306,6 @@ while(valid) { ...@@ -292,9 +306,6 @@ while(valid) {
for (uchar i=0; i<DockConst_num_of_genes; i++) { for (uchar i=0; i<DockConst_num_of_genes; i++) {
if (i == 0) { if (i == 0) {
float2 evalenergy = {*(float*)&LS_eval, current_energy}; float2 evalenergy = {*(float*)&LS_eval, current_energy};
/*
write_channel_intel(chan_LS2GA_LS4_evalenergy, evalenergy);
*/
write_channel_intel(chan_LS2GA_evalenergy[3], evalenergy); write_channel_intel(chan_LS2GA_evalenergy[3], evalenergy);
} }
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
......
...@@ -124,12 +124,26 @@ while(valid) { ...@@ -124,12 +124,26 @@ while(valid) {
write_channel_intel(chan_LS2Arbiter_end[4], (rho < DockConst_rho_lower_bound)?true:false); write_channel_intel(chan_LS2Arbiter_end[4], (rho < DockConst_rho_lower_bound)?true:false);
#endif #endif
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
// read PRNG values from prng_kernel using channels.
// this loop and the following one were initially merged for deeper pipelining.
// however, the PRNG channel read created a bottleneck (II=15).
// split into two separate loops, each one achieves II=1.
float float_prng [ACTUAL_GENOTYPE_LENGTH];
for (uchar i=0; i<DockConst_num_of_genes; i++) {
float_prng [i] = read_channel_intel(chan_PRNG2LS_float_prng[4]);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
// new random deviate // new random deviate
// rho is the deviation of the uniform distribution // rho is the deviation of the uniform distribution
for (uchar i=0; i<DockConst_num_of_genes; i++) { for (uchar i=0; i<DockConst_num_of_genes; i++) {
/*
float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[4]); float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[4]);
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
*/
float tmp_prng = float_prng[i];
#if defined (FIXED_POINT_LS5) #if defined (FIXED_POINT_LS5)
fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng; fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng;
...@@ -292,9 +306,6 @@ while(valid) { ...@@ -292,9 +306,6 @@ while(valid) {
for (uchar i=0; i<DockConst_num_of_genes; i++) { for (uchar i=0; i<DockConst_num_of_genes; i++) {
if (i == 0) { if (i == 0) {
float2 evalenergy = {*(float*)&LS_eval, current_energy}; float2 evalenergy = {*(float*)&LS_eval, current_energy};
/*
write_channel_intel(chan_LS2GA_LS5_evalenergy, evalenergy);
*/
write_channel_intel(chan_LS2GA_evalenergy[4], evalenergy); write_channel_intel(chan_LS2GA_evalenergy[4], evalenergy);
} }
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
......
...@@ -124,12 +124,26 @@ while(valid) { ...@@ -124,12 +124,26 @@ while(valid) {
write_channel_intel(chan_LS2Arbiter_end[5], (rho < DockConst_rho_lower_bound)?true:false); write_channel_intel(chan_LS2Arbiter_end[5], (rho < DockConst_rho_lower_bound)?true:false);
#endif #endif
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
// read PRNG values from prng_kernel using channels.
// this loop and the following one were initially merged for deeper pipelining.
// however, the PRNG channel read created a bottleneck (II=15).
// split into two separate loops, each one achieves II=1.
float float_prng [ACTUAL_GENOTYPE_LENGTH];
for (uchar i=0; i<DockConst_num_of_genes; i++) {
float_prng [i] = read_channel_intel(chan_PRNG2LS_float_prng[5]);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
// new random deviate // new random deviate
// rho is the deviation of the uniform distribution // rho is the deviation of the uniform distribution
for (uchar i=0; i<DockConst_num_of_genes; i++) { for (uchar i=0; i<DockConst_num_of_genes; i++) {
/*
float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[5]); float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[5]);
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
*/
float tmp_prng = float_prng[i];
#if defined (FIXED_POINT_LS6) #if defined (FIXED_POINT_LS6)
fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng; fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng;
...@@ -292,9 +306,6 @@ while(valid) { ...@@ -292,9 +306,6 @@ while(valid) {
for (uchar i=0; i<DockConst_num_of_genes; i++) { for (uchar i=0; i<DockConst_num_of_genes; i++) {
if (i == 0) { if (i == 0) {
float2 evalenergy = {*(float*)&LS_eval, current_energy}; float2 evalenergy = {*(float*)&LS_eval, current_energy};
/*
write_channel_intel(chan_LS2GA_LS6_evalenergy, evalenergy);
*/
write_channel_intel(chan_LS2GA_evalenergy[5], evalenergy); write_channel_intel(chan_LS2GA_evalenergy[5], evalenergy);
} }
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
......
...@@ -124,12 +124,26 @@ while(valid) { ...@@ -124,12 +124,26 @@ while(valid) {
write_channel_intel(chan_LS2Arbiter_end[6], (rho < DockConst_rho_lower_bound)?true:false); write_channel_intel(chan_LS2Arbiter_end[6], (rho < DockConst_rho_lower_bound)?true:false);
#endif #endif
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
// read PRNG values from prng_kernel using channels.
// this loop and the following one were initially merged for deeper pipelining.
// however, the PRNG channel read created a bottleneck (II=15).
// split into two separate loops, each one achieves II=1.
float float_prng [ACTUAL_GENOTYPE_LENGTH];
for (uchar i=0; i<DockConst_num_of_genes; i++) {
float_prng [i] = read_channel_intel(chan_PRNG2LS_float_prng[6]);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
// new random deviate // new random deviate
// rho is the deviation of the uniform distribution // rho is the deviation of the uniform distribution
for (uchar i=0; i<DockConst_num_of_genes; i++) { for (uchar i=0; i<DockConst_num_of_genes; i++) {
/*
float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[6]); float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[6]);
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
*/
float tmp_prng = float_prng[i];
#if defined (FIXED_POINT_LS7) #if defined (FIXED_POINT_LS7)
fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng; fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng;
...@@ -292,9 +306,6 @@ while(valid) { ...@@ -292,9 +306,6 @@ while(valid) {
for (uchar i=0; i<DockConst_num_of_genes; i++) { for (uchar i=0; i<DockConst_num_of_genes; i++) {
if (i == 0) { if (i == 0) {
float2 evalenergy = {*(float*)&LS_eval, current_energy}; float2 evalenergy = {*(float*)&LS_eval, current_energy};
/*
write_channel_intel(chan_LS2GA_LS7_evalenergy, evalenergy);
*/
write_channel_intel(chan_LS2GA_evalenergy[6], evalenergy); write_channel_intel(chan_LS2GA_evalenergy[6], evalenergy);
} }
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
......
...@@ -124,12 +124,26 @@ while(valid) { ...@@ -124,12 +124,26 @@ while(valid) {
write_channel_intel(chan_LS2Arbiter_end[7], (rho < DockConst_rho_lower_bound)?true:false); write_channel_intel(chan_LS2Arbiter_end[7], (rho < DockConst_rho_lower_bound)?true:false);
#endif #endif
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
// read PRNG values from prng_kernel using channels.
// this loop and the following one were initially merged for deeper pipelining.
// however, the PRNG channel read created a bottleneck (II=15).
// split into two separate loops, each one achieves II=1.
float float_prng [ACTUAL_GENOTYPE_LENGTH];
for (uchar i=0; i<DockConst_num_of_genes; i++) {
float_prng [i] = read_channel_intel(chan_PRNG2LS_float_prng[7]);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
// new random deviate // new random deviate
// rho is the deviation of the uniform distribution // rho is the deviation of the uniform distribution
for (uchar i=0; i<DockConst_num_of_genes; i++) { for (uchar i=0; i<DockConst_num_of_genes; i++) {
/*
float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[7]); float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[7]);
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
*/
float tmp_prng = float_prng[i];
#if defined (FIXED_POINT_LS8) #if defined (FIXED_POINT_LS8)
fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng; fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng;
...@@ -292,9 +306,6 @@ while(valid) { ...@@ -292,9 +306,6 @@ while(valid) {
for (uchar i=0; i<DockConst_num_of_genes; i++) { for (uchar i=0; i<DockConst_num_of_genes; i++) {
if (i == 0) { if (i == 0) {
float2 evalenergy = {*(float*)&LS_eval, current_energy}; float2 evalenergy = {*(float*)&LS_eval, current_energy};
/*
write_channel_intel(chan_LS2GA_LS8_evalenergy, evalenergy);
*/
write_channel_intel(chan_LS2GA_evalenergy[7], evalenergy); write_channel_intel(chan_LS2GA_evalenergy[7], evalenergy);
} }
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
......
...@@ -124,12 +124,26 @@ while(valid) { ...@@ -124,12 +124,26 @@ while(valid) {
write_channel_intel(chan_LS2Arbiter_end[8], (rho < DockConst_rho_lower_bound)?true:false); write_channel_intel(chan_LS2Arbiter_end[8], (rho < DockConst_rho_lower_bound)?true:false);
#endif #endif
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
// read PRNG values from prng_kernel using channels.
// this loop and the following one were initially merged for deeper pipelining.
// however, the PRNG channel read created a bottleneck (II=15).
// split into two separate loops, each one achieves II=1.
float float_prng [ACTUAL_GENOTYPE_LENGTH];
for (uchar i=0; i<DockConst_num_of_genes; i++) {
float_prng [i] = read_channel_intel(chan_PRNG2LS_float_prng[8]);
}
mem_fence(CLK_CHANNEL_MEM_FENCE);
// new random deviate // new random deviate
// rho is the deviation of the uniform distribution // rho is the deviation of the uniform distribution
for (uchar i=0; i<DockConst_num_of_genes; i++) { for (uchar i=0; i<DockConst_num_of_genes; i++) {
/*
float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[8]); float tmp_prng = read_channel_intel(chan_PRNG2LS_float_prng[8]);
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
*/
float tmp_prng = float_prng[i];
#if defined (FIXED_POINT_LS9) #if defined (FIXED_POINT_LS9)
fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng; fixedpt fixpt_tmp_prng = *(fixedpt*) &tmp_prng;
...@@ -292,9 +306,6 @@ while(valid) { ...@@ -292,9 +306,6 @@ while(valid) {
for (uchar i=0; i<DockConst_num_of_genes; i++) { for (uchar i=0; i<DockConst_num_of_genes; i++) {
if (i == 0) { if (i == 0) {
float2 evalenergy = {*(float*)&LS_eval, current_energy}; float2 evalenergy = {*(float*)&LS_eval, current_energy};
/*
write_channel_intel(chan_LS2GA_LS9_evalenergy, evalenergy);
*/
write_channel_intel(chan_LS2GA_evalenergy[8], evalenergy); write_channel_intel(chan_LS2GA_evalenergy[8], evalenergy);
} }
mem_fence(CLK_CHANNEL_MEM_FENCE); mem_fence(CLK_CHANNEL_MEM_FENCE);
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment