Commit fbf5743a authored by Leonardo Solis's avatar Leonardo Solis

merging fastergrad onto master

parents ebf9a4b0 2ce54688
......@@ -50,7 +50,10 @@ K1_NAME="gpu_calc_initpop"
K2_NAME="gpu_sum_evals"
K3_NAME="perform_LS"
K4_NAME="gpu_gen_and_eval_newpops"
K_NAMES=-DK1=$(K1_NAME) -DK2=$(K2_NAME) -DK3=$(K3_NAME) -DK4=$(K4_NAME)
K5_NAME="gradient_minSD"
K6_NAME="gradient_minFire"
K7_NAME="gradient_minAD"
K_NAMES=-DK1=$(K1_NAME) -DK2=$(K2_NAME) -DK3=$(K3_NAME) -DK4=$(K4_NAME) -DK5=$(K5_NAME) -DK6=$(K6_NAME) -DK7=$(K7_NAME)
# Kernel flags
KFLAGS=-DKRNL_SOURCE=$(KRNL_DIR)/$(KRNL_MAIN) -DKRNL_DIRECTORY=$(KRNL_DIR) -DKCMN_DIRECTORY=$(KCMN_DIR) $(K_NAMES)
......@@ -95,8 +98,10 @@ else
endif
# ------------------------------------------------------
# Configuration (Host)
# Valid values: RELEASE, DEBUG
# Configuration
# FDEBUG (full) : enables debugging on both host + device
# LDEBUG (light): enables debugging on host
# RELEASE
CONFIG=RELEASE
OCL_DEBUG_BASIC=-DPLATFORM_ATTRIBUTES_DISPLAY\
......@@ -113,7 +118,9 @@ OCL_DEBUG_ALL=$(OCL_DEBUG_BASIC) \
-DKERNEL_WORK_GROUP_INFO_DISPLAY \
-DBUFFER_OBJECT_INFO_DISPLAY
ifeq ($(CONFIG),DEBUG)
ifeq ($(CONFIG),FDEBUG)
OPT =-O0 -g3 -Wall $(OCL_DEBUG_ALL) -DDOCK_DEBUG
else ifeq ($(CONFIG),LDEBUG)
OPT =-O0 -g3 -Wall $(OCL_DEBUG_BASIC)
else ifeq ($(CONFIG),RELEASE)
OPT =-O3
......@@ -122,18 +129,9 @@ else
endif
# ------------------------------------------------------
# Host and Device Debug
DOCK_DEBUG=NO
# Reproduce results (remove randomness)
REPRO=NO
ifeq ($(DOCK_DEBUG),YES)
DD =-DDOCK_DEBUG
else
DD =
endif
ifeq ($(REPRO),YES)
REP =-DREPRO
else
......@@ -188,11 +186,66 @@ check-env-gpu:
check-env-all: check-env-dev check-env-cpu check-env-gpu
# ------------------------------------------------------
# Printing out its git version hash
GIT_VERSION := $(shell git describe --abbrev=40 --dirty --always --tags)
CFLAGS+=-DVERSION=\"$(GIT_VERSION)\"
# ------------------------------------------------------
stringify:
./stringify_ocl_krnls.sh
odock: check-env-all stringify $(SRC)
g++ $(SRC) $(CFLAGS) -lOpenCL -o$(BIN_DIR)/$(TARGET) $(DEV) $(NWI) $(OPT) $(DD) $(REP) $(KFLAGS)
g++ \
$(SRC) \
$(CFLAGS) \
-lOpenCL \
-o$(BIN_DIR)/$(TARGET) \
$(DEV) $(NWI) $(OPT) $(DD) $(REP) $(KFLAGS)
# Example
# 1ac8: for testing gradients of translation and rotation genes
# 7cpa: for testing gradients of torsion genes (15 torsions)
# 3tmn: for testing gradients of torsion genes (1 torsion)
# Settings consumed by the `test` target below. Each one can be overridden
# on the command line, e.g. `make test PDB=1ac8 NRUN=10`.
#   PDB      : case name; inputs are read from ./input/$(PDB)/derived/
#   NRUN     : value passed to -nrun
#   NGEN     : value passed to -ngen
#   POPSIZE  : value passed to -psize
#   TESTNAME : value passed to -resnam (name for the docking output log)
#   TESTLS   : value passed to -lsmet
PDB := 3ce3
NRUN := 100
NGEN := 27000
POPSIZE := 150
TESTNAME := test
TESTLS := sw
# Builds the binary through the `odock` prerequisite, then runs it on the
# $(PDB)_protein.maps.fld / $(PDB)_ligand.pdbqt pair using the settings above.
# -gfpop is fixed to 0 for this target.
test: odock
$(BIN_DIR)/$(TARGET) \
-ffile ./input/$(PDB)/derived/$(PDB)_protein.maps.fld \
-lfile ./input/$(PDB)/derived/$(PDB)_ligand.pdbqt \
-nrun $(NRUN) \
-ngen $(NGEN) \
-psize $(POPSIZE) \
-resnam $(TESTNAME) \
-gfpop 0 \
-lsmet $(TESTLS)
# Settings consumed by the `astex` target below. Each one can be overridden
# on the command line, e.g. `make astex ASTEX_PDB=<case>`.
#   ASTEX_PDB      : case folder under ./input_tsri/search-set-astex/
#   ASTEX_NRUN     : value passed to -nrun
#   ASTEX_POPSIZE  : value passed to -psize
#   ASTEX_TESTNAME : value passed to -resnam (name for the docking output log)
#   ASTEX_LS       : value passed to -lsmet
ASTEX_PDB := 2bsm
ASTEX_NRUN:= 10
ASTEX_POPSIZE := 10
ASTEX_TESTNAME := test_astex
ASTEX_LS := sw
# Builds the binary through the `odock` prerequisite, then runs it on the
# protein.maps.fld / flex-xray.pdbqt pair of the selected case.
# Unlike `test`, no -ngen is passed and -gfpop is fixed to 1.
astex: odock
$(BIN_DIR)/$(TARGET) \
-ffile ./input_tsri/search-set-astex/$(ASTEX_PDB)/protein.maps.fld \
-lfile ./input_tsri/search-set-astex/$(ASTEX_PDB)/flex-xray.pdbqt \
-nrun $(ASTEX_NRUN) \
-psize $(ASTEX_POPSIZE) \
-resnam $(ASTEX_TESTNAME) \
-gfpop 1 \
-lsmet $(ASTEX_LS)
# $(BIN_DIR)/$(TARGET) -ffile ./input_tsri/search-set-astex/$(ASTEX_PDB)/protein.maps.fld -lfile ./input_tsri/search-set-astex/$(ASTEX_PDB)/flex-xray.pdbqt -nrun $(ASTEX_NRUN) -psize $(ASTEX_POPSIZE) -resnam $(ASTEX_TESTNAME) -gfpop 1 | tee ./input_tsri/search-set-astex/intrapairs/$(ASTEX_PDB)_intrapair.txt
PDB := 3ce3
NRUN := 100
......
......@@ -98,6 +98,7 @@ By default the output log file is written in the current working folder. Example
| -lsrat | Local-search rate | 6 (%) |
| -trat | Tournament rate | 60 (%) |
| -resnam | Name for docking output log | _"docking"_ |
| -hsym | Handle symmetry in RMSD calc. | 1 |
For a complete list of available arguments and their default values, check: [getparameters.cpp](host/src/getparameters.cpp).
......
......@@ -27,19 +27,19 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#include "defines.h"
#define RLIST_ATOMID_MASK 0x000000FF
#define RLIST_RBONDID_MASK 0x0000FF00
#define RLIST_RBONDID_SHIFT 8
#define RLIST_FIRSTROT_MASK 0x00010000
#define RLIST_GENROT_MASK 0x00020000
#define RLIST_DUMMY_MASK 0x00040000
#define DEG_TO_RAD 0.0174533f
#define RLIST_ATOMID_MASK 0x000000FF
#define RLIST_RBONDID_MASK 0x0000FF00
#define RLIST_RBONDID_SHIFT 8
#define RLIST_FIRSTROT_MASK 0x00010000
#define RLIST_GENROT_MASK 0x00020000
#define RLIST_DUMMY_MASK 0x00040000
#define DEG_TO_RAD 0.0174533f
// LCG: linear congruential generator constants
#define RAND_A 1103515245u
#define RAND_C 12345u
#define RAND_A 1103515245u
#define RAND_C 12345u
// WARNING: it is assumed that unsigned int is 32 bits long
#define MAX_UINT 4294967296.0f
#define MAX_UINT 4294967296.0f
// Macro for capturing grid values
// Original
......@@ -57,4 +57,49 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
cube[0][0][1]*weights[0][0][1] +cube[1][0][1]*weights[1][0][1] + \
cube[0][1][1]*weights[0][1][1] +cube[1][1][1]*weights[1][1][1])
// Constants for the dielectric term of the
// electrostatic component of the intramolecular energy/gradient
#define DIEL_A -8.5525f
#define DIEL_WAT 78.4f
#define DIEL_B (DIEL_WAT - DIEL_A)
#define DIEL_LAMBDA 0.003627f
#define DIEL_H DIEL_LAMBDA
#define DIEL_K 7.7839f
#define DIEL_B_TIMES_H (DIEL_B * DIEL_H)
#define DIEL_B_TIMES_H_TIMES_K (DIEL_B_TIMES_H * DIEL_K)
// Used for Shoemake to quaternion transformation
#define PI_TIMES_2 (float)(2.0f*M_PI)
#define PI_FLOAT (float)(M_PI)
// -------------------------------------------
// Gradient-related defines
// -------------------------------------------
#define INFINITESIMAL_RADIAN 1E-3
#define HALF_INFINITESIMAL_RADIAN (0.5f * INFINITESIMAL_RADIAN)
#define INV_INFINITESIMAL_RADIAN (1/INFINITESIMAL_RADIAN)
#define COS_HALF_INFINITESIMAL_RADIAN cos(HALF_INFINITESIMAL_RADIAN)
#define SIN_HALF_INFINITESIMAL_RADIAN sin(HALF_INFINITESIMAL_RADIAN)
/*
#define TRANGENE_ALPHA 1E-3
#define ROTAGENE_ALPHA 1E-8
#define TORSGENE_ALPHA 1E-13
*/
#define STEP_INCREASE 1.2f
#define STEP_DECREASE 0.2f
#define STEP_START 1E3 // Starting step size. This might look gigantic but will cap
#define MAX_DEV_TRANSLATION 2.0f // 2 Angstrom, but must be divided by the gridspacing (store in variable)
//#define MAX_DEV_ROTATION 0.2f // Shoemake range [0, 1]
#define MAX_DEV_ROTATION 0.5f/DEG_TO_RAD // 0.5f RAD
#define MAX_DEV_TORSION 0.5f/DEG_TO_RAD // 0.5f RAD
#endif /* CALCENERGY_BASIC_H_ */
......@@ -39,7 +39,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#define NUM_OF_THREADS_PER_BLOCK 64
#endif
#define ATYPE_NUM 22
#define ATYPE_NUM 22 // initial: 22
#define ATYPE_GETBONDS 16 // initial: 16
#define MAX_NUM_OF_ATOMS 256
#define MAX_NUM_OF_ATYPES 14
#define MAX_NUM_OF_ROTBONDS 32
......@@ -47,19 +48,27 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#define MAX_NUM_OF_ROTATIONS (MAX_NUM_OF_ATOMS * MAX_NUM_OF_ROTBONDS)
#define MAX_POPSIZE 2048
#define MAX_NUM_OF_RUNS 1000
#define MAX_NUM_GRIDPOINTS 256
// Must be bigger than MAX_NUM_OF_ROTBONDS+6
#define GENOTYPE_LENGTH_IN_GLOBMEM 64
#define ACTUAL_GENOTYPE_LENGTH (MAX_NUM_OF_ROTBONDS+6)
#define LS_EXP_FACTOR 2.0f
#define LS_CONT_FACTOR 0.5f
#define LS_EXP_FACTOR 2.0f
#define LS_CONT_FACTOR 0.5f
// Improvements over Pechan's implementation
#define NATIVE_PRECISION
#define ASYNC_COPY
#define IMPROVE_GRID
#define RESTRICT_ARGS
#define MAPPED_COPY
// TODO: convert this into a program arg
//#define GRADIENT_ENABLED
#endif /* DEFINES_H_ */
......@@ -25,12 +25,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
// -------------------------------------------------------
//
// -------------------------------------------------------
unsigned int gpu_rand(
#if defined (RESTRICT_ARGS)
__global unsigned int* restrict prng_states
#else
__global unsigned int* prng_states
#endif
uint gpu_rand(
__global uint* restrict prng_states
)
//The GPU device function generates a random int
//with a linear congruential generator.
......@@ -39,20 +35,20 @@ unsigned int gpu_rand(
//prng_states (thread with ID tx in block with ID bx stores its state in prng_states[bx*NUM_OF_THREADS_PER_BLOCK+$
//The random number generator uses the gcc linear congruential generator constants.
{
unsigned int state;
uint state;
#if defined (REPRO)
state = 1;
#else
//current state of the threads own PRNG
//state = prng_states[get_group_id(0)*NUM_OF_THREADS_PER_BLOCK + get_local_id(0)];
// Current state of the threads own PRNG
// state = prng_states[get_group_id(0)*NUM_OF_THREADS_PER_BLOCK + get_local_id(0)];
state = prng_states[get_global_id(0)];
//calculating next state
state = (RAND_A*state+RAND_C);
// Calculating next state
state = (RAND_A*state+RAND_C);
#endif
//saving next state to memory
//prng_states[get_group_id(0)*NUM_OF_THREADS_PER_BLOCK + get_local_id(0)] = state;
// Saving next state to memory
// prng_states[get_group_id(0)*NUM_OF_THREADS_PER_BLOCK + get_local_id(0)] = state;
prng_states[get_global_id(0)] = state;
return state;
......@@ -62,29 +58,19 @@ unsigned int gpu_rand(
//
// -------------------------------------------------------
float gpu_randf(
#if defined (RESTRICT_ARGS)
__global unsigned int* restrict prng_states
#else
__global unsigned int* prng_states
#endif
__global uint* restrict prng_states
)
//The GPU device function generates a
//random float greater than (or equal to) 0 and less than 1.
//It uses gpu_rand() function.
{
float state;
float state;
//state will be between 0 and 1
// State will be between 0 and 1
#if defined (REPRO)
state = 0.55f; //0.55f;
#else
#if defined (NATIVE_PRECISION)
state = native_divide(gpu_rand(prng_states),MAX_UINT)*0.999999f;
#elif defined (HALF_PRECISION)
state = half_divide(gpu_rand(prng_states),MAX_UINT)*0.999999f;
#else // Full precision
state = (((float) gpu_rand(prng_states))/MAX_UINT)*0.999999f;
#endif
#endif
return state;
......@@ -94,50 +80,33 @@ float gpu_randf(
//
// -------------------------------------------------------
void map_angle(__local float* angle)
//The GPU device function maps
//the input parameter to the interval 0...360
//(supposing that it is an angle).
// The GPU device function maps
// the input parameter to the interval 0...360
// (supposing that it is an angle).
{
while (*angle >= 360.0f)
*angle -= 360.0f;
while (*angle >= 360.0f) {
*angle -= 360.0f;
}
while (*angle < 0.0f)
*angle += 360.0f;
while (*angle < 0.0f) {
*angle += 360.0f;
}
}
// -------------------------------------------------------
//
// -------------------------------------------------------
void gpu_perform_elitist_selection(int dockpars_pop_size,
#if defined (RESTRICT_ARGS)
void gpu_perform_elitist_selection(
int dockpars_pop_size,
__global float* restrict dockpars_energies_current,
__global float* restrict dockpars_energies_next,
__global int* restrict dockpars_evals_of_new_entities,
#else
__global float* dockpars_energies_current,
__global float* dockpars_energies_next,
__global int* dockpars_evals_of_new_entities,
#endif
int dockpars_num_of_genes,
#if defined (RESTRICT_ARGS)
__global float* restrict dockpars_conformations_next,
__global const float* restrict dockpars_conformations_current
#else
__global float* dockpars_conformations_next,
__global const float* dockpars_conformations_current
#endif
,
// Some OpenCL compilers don't allow local var outside kernels
// so this local vars are passed from a kernel
__local float* best_energies,
__local int* best_IDs,
__local int* best_ID
__global const float* restrict dockpars_conformations_current,
__local float* best_energies,
__local int* best_IDs,
__local int* best_ID
)
//The GPU device function performs elitist selection,
//that is, it looks for the best entity in conformations_current and
......@@ -145,68 +114,58 @@ void gpu_perform_elitist_selection(int dockpars_pop_size,
//and copies it to the place of the first entity in
//conformations_next and energies_next.
{
int entity_counter;
int gene_counter;
float best_energy;
// Some OpenCL compilers don't allow local var outside kernels
// so this local vars are passed from a kernel
//__local float best_energies[NUM_OF_THREADS_PER_BLOCK];
//__local int best_IDs[NUM_OF_THREADS_PER_BLOCK];
//__local int best_ID;
if (get_local_id(0) < dockpars_pop_size)
{
if (get_local_id(0) < dockpars_pop_size) {
best_energies[get_local_id(0)] = dockpars_energies_current[get_group_id(0)+get_local_id(0)];
best_IDs[get_local_id(0)] = get_local_id(0);
}
for (entity_counter=NUM_OF_THREADS_PER_BLOCK+get_local_id(0);
entity_counter<dockpars_pop_size;
entity_counter+=NUM_OF_THREADS_PER_BLOCK)
for (entity_counter = NUM_OF_THREADS_PER_BLOCK+get_local_id(0);
entity_counter < dockpars_pop_size;
entity_counter+= NUM_OF_THREADS_PER_BLOCK) {
if (dockpars_energies_current[get_group_id(0)+entity_counter] < best_energies[get_local_id(0)])
{
if (dockpars_energies_current[get_group_id(0)+entity_counter] < best_energies[get_local_id(0)]) {
best_energies[get_local_id(0)] = dockpars_energies_current[get_group_id(0)+entity_counter];
best_IDs[get_local_id(0)] = entity_counter;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
//this could be implemented with a tree-like structure
//which may be slightly faster
// This could be implemented with a tree-like structure
// which may be slightly faster
if (get_local_id(0) == 0)
{
best_energy = best_energies[0];
//best_ID = best_IDs[0];
best_ID[0] = best_IDs[0];
for (entity_counter=1;
entity_counter<NUM_OF_THREADS_PER_BLOCK;
entity_counter++)
for (entity_counter = 1;
entity_counter < NUM_OF_THREADS_PER_BLOCK;
entity_counter++) {
if ((best_energies[entity_counter] < best_energy) && (entity_counter < dockpars_pop_size))
{
if ((best_energies[entity_counter] < best_energy) && (entity_counter < dockpars_pop_size)) {
best_energy = best_energies[entity_counter];
//best_ID = best_IDs[entity_counter];
best_ID[0] = best_IDs[entity_counter];
}
}
//setting energy value of new entity
// Setting energy value of new entity
dockpars_energies_next[get_group_id(0)] = best_energy;
//0 evals were performed for entity selected with elitism (since it was copied only)
// Zero (0) evals were performed for entity selected with elitism (since it was copied only)
dockpars_evals_of_new_entities[get_group_id(0)] = 0;
}
//now best_id stores the id of the best entity in the population,
//copying genotype and energy value to the first entity of new population
// "best_id" stores the id of the best entity in the population,
// Copying genotype and energy value to the first entity of new population
barrier(CLK_LOCAL_MEM_FENCE);
for (gene_counter=get_local_id(0);
gene_counter<dockpars_num_of_genes;
gene_counter+=NUM_OF_THREADS_PER_BLOCK)
//dockpars_conformations_next[GENOTYPE_LENGTH_IN_GLOBMEM*get_group_id(0)+gene_counter] = dockpars_conformations_current[GENOTYPE_LENGTH_IN_GLOBMEM*get_group_id(0)+GENOTYPE_LENGTH_IN_GLOBMEM*best_ID+gene_counter];
dockpars_conformations_next[GENOTYPE_LENGTH_IN_GLOBMEM*get_group_id(0)+gene_counter] = dockpars_conformations_current[GENOTYPE_LENGTH_IN_GLOBMEM*get_group_id(0)+GENOTYPE_LENGTH_IN_GLOBMEM*best_ID[0]+gene_counter];
for (gene_counter = get_local_id(0);
gene_counter < dockpars_num_of_genes;
gene_counter+= NUM_OF_THREADS_PER_BLOCK) {
dockpars_conformations_next[GENOTYPE_LENGTH_IN_GLOBMEM*get_group_id(0)+gene_counter] = dockpars_conformations_current[GENOTYPE_LENGTH_IN_GLOBMEM*get_group_id(0) + GENOTYPE_LENGTH_IN_GLOBMEM*best_ID[0]+gene_counter];
}
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -21,68 +21,68 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
//#define DEBUG_ENERGY_KERNEL1
__kernel void __attribute__ ((reqd_work_group_size(NUM_OF_THREADS_PER_BLOCK,1,1)))
gpu_calc_initpop( char dockpars_num_of_atoms,
gpu_calc_initpop(
char dockpars_num_of_atoms,
char dockpars_num_of_atypes,
int dockpars_num_of_intraE_contributors,
char dockpars_gridsize_x,
char dockpars_gridsize_y,
char dockpars_gridsize_z,
// g1 = gridsize_x
uint dockpars_gridsize_x_times_y, // g2 = gridsize_x * gridsize_y
uint dockpars_gridsize_x_times_y_times_z, // g3 = gridsize_x * gridsize_y * gridsize_z
float dockpars_grid_spacing,
#if defined (RESTRICT_ARGS)
__global const float* restrict dockpars_fgrids, // cannot be allocated in __constant (too large)
#else
__global const float* dockpars_fgrids, // cannot be allocated in __constant (too large)
#endif
__global const float* restrict dockpars_fgrids, // This is too large to be allocated in __constant
int dockpars_rotbondlist_length,
float dockpars_coeff_elec,
float dockpars_coeff_desolv,
#if defined (RESTRICT_ARGS)
__global const float* restrict dockpars_conformations_current,
__global float* restrict dockpars_energies_current,
__global int* restrict dockpars_evals_of_new_entities,
#else
__global const float* dockpars_conformations_current,
__global float* dockpars_energies_current,
__global int* dockpars_evals_of_new_entities,
#endif
__global const float* restrict dockpars_conformations_current,
__global float* restrict dockpars_energies_current,
__global int* restrict dockpars_evals_of_new_entities,
int dockpars_pop_size,
float dockpars_qasp,
float dockpars_smooth,
float dockpars_smooth,
__constant kernelconstant_interintra* kerconst_interintra,
__global const kernelconstant_intracontrib* kerconst_intracontrib,
__constant kernelconstant_intra* kerconst_intra,
__constant kernelconstant_rotlist* kerconst_rotlist,
__constant kernelconstant_conform* kerconst_conform
__constant kernelconstant_interintra* kerconst_interintra,
__global const kernelconstant_intracontrib* kerconst_intracontrib,
__constant kernelconstant_intra* kerconst_intra,
__constant kernelconstant_rotlist* kerconst_rotlist,
__constant kernelconstant_conform* kerconst_conform
){
// Some OpenCL compilers don't allow declaring
// local variables within non-kernel functions.
// These local variables must be declared in a kernel,
// and then passed to non-kernel functions.
__local float genotype[ACTUAL_GENOTYPE_LENGTH];
__local float energy;
__local int run_id;
// Some OpenCL compilers don't allow local var outside kernels
// so this local vars are passed from a kernel
__local float calc_coords_x[MAX_NUM_OF_ATOMS];
__local float calc_coords_y[MAX_NUM_OF_ATOMS];
__local float calc_coords_z[MAX_NUM_OF_ATOMS];
__local float partial_energies[NUM_OF_THREADS_PER_BLOCK];
#if defined (DEBUG_ENERGY_KERNEL)
__local float partial_interE[NUM_OF_THREADS_PER_BLOCK];
__local float partial_intraE[NUM_OF_THREADS_PER_BLOCK];
#endif
// Copying genotype from global memory
event_t ev = async_work_group_copy(genotype,
dockpars_conformations_current + GENOTYPE_LENGTH_IN_GLOBMEM*get_group_id(0),
ACTUAL_GENOTYPE_LENGTH, 0);
//determining run ID
// Determining run-ID
if (get_local_id(0) == 0) {
run_id = get_group_id(0) / dockpars_pop_size;
}
// Asynchronous copy should be finished by here
wait_group_events(1,&ev);
wait_group_events(1, &ev);
// Evaluating initial genotype
// Evaluating initial genotypes
barrier(CLK_LOCAL_MEM_FENCE);
// =============================================================
......@@ -91,6 +91,9 @@ gpu_calc_initpop( char dockpars_num_of_atoms,
dockpars_gridsize_x,
dockpars_gridsize_y,
dockpars_gridsize_z,
// g1 = gridsize_x
dockpars_gridsize_x_times_y, // g2 = gridsize_x * gridsize_y
dockpars_gridsize_x_times_y_times_z, // g3 = gridsize_x * gridsize_y * gridsize_z
dockpars_fgrids,
dockpars_num_of_atypes,
dockpars_num_of_intraE_contributors,
......@@ -103,22 +106,35 @@ gpu_calc_initpop( char dockpars_num_of_atoms,
genotype,
&energy,
&run_id,
// Some OpenCL compilers don't allow local var outside kernels
// so this local vars are passed from a kernel
// Some OpenCL compilers don't allow declaring
// local variables within non-kernel functions.
// These local variables must be declared in a kernel,
// and then passed to non-kernel functions.
calc_coords_x,
calc_coords_y,
calc_coords_z,
partial_energies,
kerconst_interintra,
kerconst_intracontrib,
kerconst_intra,
kerconst_rotlist,
kerconst_conform);
#if defined (DEBUG_ENERGY_KERNEL)
partial_interE,
partial_intraE,
#endif
#if 0
false,
#endif
kerconst_interintra,
kerconst_intracontrib,
kerconst_intra,
kerconst_rotlist,
kerconst_conform
);
// =============================================================
if (get_local_id(0) == 0) {
dockpars_energies_current[get_group_id(0)] = energy;
dockpars_evals_of_new_entities[get_group_id(0)] = 1;
#if defined (DEBUG_ENERGY_KERNEL1)
printf("%-18s [%-5s]---{%-5s} [%-10.8f]---{%-10.8f}\n", "-ENERGY-KERNEL1-", "GRIDS", "INTRA", partial_interE[0], partial_intraE[0]);
#endif
}
}
......@@ -23,16 +23,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
__kernel void __attribute__ ((reqd_work_group_size(NUM_OF_THREADS_PER_BLOCK,1,1)))
gpu_sum_evals(/*unsigned long pop_size,*/
unsigned int pop_size,
/*unsigned long num_of_runs,*/
#if defined (RESTRICT_ARGS)
gpu_sum_evals(
uint pop_size,
__global int* restrict dockpars_evals_of_new_entities,
__global int* restrict evals_of_runs
#else
__global int* dockpars_evals_of_new_entities,
__global int* evals_of_runs
#endif
)
//The GPU global function sums the evaluation counter states
//which are stored in evals_of_new_entities array foreach entity,
......@@ -41,28 +35,28 @@ gpu_sum_evals(/*unsigned long pop_size,*/
//since each block performs the summation for one run.
{
int entity_counter;
int sum_evals;
__local int partsum_evals[NUM_OF_THREADS_PER_BLOCK];
int sum_evals;
__local int partsum_evals[NUM_OF_THREADS_PER_BLOCK];
partsum_evals[get_local_id(0)] = 0;
partsum_evals[get_local_id(0)] = 0;
for (entity_counter=get_local_id(0);
entity_counter<pop_size;
entity_counter+=NUM_OF_THREADS_PER_BLOCK) {
partsum_evals[get_local_id(0)] += dockpars_evals_of_new_entities[get_group_id(0)*pop_size+entity_counter];
}
for (entity_counter = get_local_id(0);
entity_counter < pop_size;
entity_counter+= NUM_OF_THREADS_PER_BLOCK) {
partsum_evals[get_local_id(0)] += dockpars_evals_of_new_entities[get_group_id(0)*pop_size + entity_counter];
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) == 0) {
sum_evals = partsum_evals[0];
if (get_local_id(0) == 0) {
sum_evals = partsum_evals[0];
for (entity_counter=1;
entity_counter<NUM_OF_THREADS_PER_BLOCK;
entity_counter++) {
for (entity_counter = 1;
entity_counter < NUM_OF_THREADS_PER_BLOCK;
entity_counter++) {
sum_evals += partsum_evals[entity_counter];
}
}
evals_of_runs[get_group_id(0)] += sum_evals;
}
evals_of_runs[get_group_id(0)] += sum_evals;