Hi Denis,
Thanks! I appreciate any help you can give me.
The function is a little bit unusual. It performs a multiplication between two 8-bit floating-point numbers. Nothing complicated, really, basically a bunch of bitwise operations. It also performs stochastic rounding (as an alternative to e.g. round to nearest), so it needs a rand function.
The MWE below does not make a lot of sense but conveys the main idea: the function fp8_value = fp8_multiply_stochastic(55, t/ms) : 1
receives integer numbers, treats them as floating-point values, and returns the product. It works with cpp but not with cuda.
from brian2 import *
import brian2cuda
set_device("cuda_standalone")
# Works with cpp
#set_device('cpp_standalone')
def fp8_multiply_stochastic(num1, num2, _vectorisation_idx):
# Not implemented
return 0
fp8_multiply_stochastic = Function(fp8_multiply_stochastic, arg_units=[1, 1],
return_unit=1, auto_vectorise=True)
cpp_code = """
int fp8_multiply_stochastic(int num1, int num2, int _vectorisation_idx){
unsigned char sign1, exp1, abs_val1, sign2, exp2, abs_val2, smaller_factor;
unsigned char int_repr1_abs, int_repr2_abs, aux_int_repr, carry;
unsigned char num_leading_zero1=0, num_leading_zero2=0;
unsigned char result, result_sign, result_exp, trunc_result, result_abs;
unsigned char discarded_bits, lfsr, round_factor;
bool is_normal1, is_normal2;
const unsigned char EXP_WIDTH = 4;
const unsigned char FRAC_WIDTH = 3;
const unsigned char FRAC_MASK = (1<<FRAC_WIDTH) - 1;
const unsigned char SIGN_WIDTH = 1;
const unsigned char N_BITS = SIGN_WIDTH + EXP_WIDTH + FRAC_WIDTH;
const unsigned char BIAS = 7;
const unsigned char GUARD_WIDTH = 3;
const unsigned char REPR_MASK = (1<<N_BITS) - 1;
// Smallest normal: 2^-(-BIAS+1)
// Smallest subnormal: 2^(-BIAS+1)/2^(FRAC_WIDTH-1)
// Biased representation of exponents, i.e. what is actually stored in hardware
const char EMIN = 0;
// Code to extract relevant fields of the bitstring
sign1 = num1 >> (EXP_WIDTH+FRAC_WIDTH);
abs_val1 = num1 & 0x7F;
exp1 = abs_val1 >> FRAC_WIDTH;
is_normal1 = abs_val1 >= (1 << FRAC_WIDTH);
// Code to extract relevant fields of the bitstring
sign2 = num2 >> (EXP_WIDTH+FRAC_WIDTH);
abs_val2 = num2 & 0x7F;
exp2 = abs_val2 >> FRAC_WIDTH;
is_normal2 = abs_val2 >= (1 << FRAC_WIDTH);
result_sign = sign1 ^ sign2;
// Multiplication with FRAC_WIDTH+1 LSBs. In hardware, aligning after result
// is obtained when subnormal values are involved gives us time to calculate
// multiplication and number of leading zeros in parallel.
int_repr1_abs = (is_normal1 << FRAC_WIDTH) | (abs_val1 & FRAC_MASK);
int_repr2_abs = (is_normal2 << FRAC_WIDTH) | (abs_val2 & FRAC_MASK);
// result occupies N_BITS bits in the form of cnr1r2...r6
result = int_repr1_abs * int_repr2_abs;
// Code to extract number of leading bits
if (int_repr1_abs==0){
num_leading_zero1 = N_BITS;
}else{
aux_int_repr = int_repr1_abs;
if(aux_int_repr<=0x0F) {aux_int_repr<<=4; num_leading_zero1+=4;}
if(aux_int_repr<=0x3F) {aux_int_repr<<=2; num_leading_zero1+=2;}
if(aux_int_repr<=0x7F) {aux_int_repr<<=1; num_leading_zero1+=1;}
}
// Normalization of result is done after multiplication, according
// to leading zeros of multiplicands. Note that if multiplicand is normal,
// no need to shift result e.g. 1.000*0.001=1000 => 1000<<3 = 1.000000
num_leading_zero1 = num_leading_zero1 - EXP_WIDTH;
if (!is_normal1) result <<= num_leading_zero1;
// Code to extract number of leading bits
if (int_repr2_abs==0){
num_leading_zero2 = N_BITS;
}else{
aux_int_repr = int_repr2_abs;
if(aux_int_repr<=0x0F) {aux_int_repr<<=4; num_leading_zero2+=4;}
if(aux_int_repr<=0x3F) {aux_int_repr<<=2; num_leading_zero2+=2;}
if(aux_int_repr<=0x7F) {aux_int_repr<<=1; num_leading_zero2+=1;}
}
num_leading_zero2 = num_leading_zero2 - EXP_WIDTH;
if (!is_normal2) result <<= num_leading_zero2;
carry = result >> (N_BITS-1);
// This represents biased exponent - 1. Implicit normal bit is added
// afterwards to correctly pack the result
result_exp = (exp1 - is_normal1
+ exp2 - is_normal2
- num_leading_zero1 - num_leading_zero2
+ (1 - BIAS) + carry);
// In hardware, a slightly larger exponent range (herein 1 extra bit) to
// handle subnormals
if((result_exp >> (EXP_WIDTH + 1)) != 0){
result >>= -result_exp;
// Note that no sticky bits are computed from eliminated bits
result_exp = EMIN;
}
smaller_factor = (abs_val1 < abs_val2) ? abs_val1 : abs_val2;
if (smaller_factor == 0) result_exp = EMIN;
discarded_bits = (result & FRAC_MASK<<carry) >> carry;
lfsr = floor(rand(_vectorisation_idx) * (1 << GUARD_WIDTH));
// MSB of truncated result is included so that it is added to exponent,
// which was calculated as the final value minus 1
trunc_result = result >> (GUARD_WIDTH+carry);
round_factor = (discarded_bits+lfsr) >> GUARD_WIDTH;
trunc_result = trunc_result + round_factor;
result_abs = (result_exp<<FRAC_WIDTH) + trunc_result;
// Dealing with overflow. Note that sign bit is used for comparison, so no
// extra space would be needed in hardware
if (result_abs > (1 << (N_BITS-1)) - 1) result_abs = (1 << (N_BITS-1)) - 1;
// Note that negative zeros are not return from operations
if (result_abs==0) result_sign = 0;
return (result_sign << (EXP_WIDTH+FRAC_WIDTH)) + result_abs;
}
"""
cuda_code = """
__host__ __device__ int fp8_multiply_stochastic(int num1, int num2, int _vectorisation_idx){
unsigned char sign1, exp1, abs_val1, sign2, exp2, abs_val2, smaller_factor;
unsigned char int_repr1_abs, int_repr2_abs, aux_int_repr, carry;
unsigned char num_leading_zero1=0, num_leading_zero2=0;
unsigned char result, result_sign, result_exp, trunc_result, result_abs;
unsigned char discarded_bits, lfsr, round_factor;
bool is_normal1, is_normal2;
const unsigned char EXP_WIDTH = 4;
const unsigned char FRAC_WIDTH = 3;
const unsigned char FRAC_MASK = (1<<FRAC_WIDTH) - 1;
const unsigned char SIGN_WIDTH = 1;
const unsigned char N_BITS = SIGN_WIDTH + EXP_WIDTH + FRAC_WIDTH;
const unsigned char BIAS = 7;
const unsigned char GUARD_WIDTH = 3;
const unsigned char REPR_MASK = (1<<N_BITS) - 1;
// Smallest normal: 2^-(-BIAS+1)
// Smallest subnormal: 2^(-BIAS+1)/2^(FRAC_WIDTH-1)
// Biased representation of exponents, i.e. what is actually stored in hardware
const char EMIN = 0;
// Code to extract relevant fields of the bitstring
sign1 = num1 >> (EXP_WIDTH+FRAC_WIDTH);
abs_val1 = num1 & 0x7F;
exp1 = abs_val1 >> FRAC_WIDTH;
is_normal1 = abs_val1 >= (1 << FRAC_WIDTH);
// Code to extract relevant fields of the bitstring
sign2 = num2 >> (EXP_WIDTH+FRAC_WIDTH);
abs_val2 = num2 & 0x7F;
exp2 = abs_val2 >> FRAC_WIDTH;
is_normal2 = abs_val2 >= (1 << FRAC_WIDTH);
result_sign = sign1 ^ sign2;
// Multiplication with FRAC_WIDTH+1 LSBs. In hardware, aligning after result
// is obtained when subnormal values are involved gives us time to calculate
// multiplication and number of leading zeros in parallel.
int_repr1_abs = (is_normal1 << FRAC_WIDTH) | (abs_val1 & FRAC_MASK);
int_repr2_abs = (is_normal2 << FRAC_WIDTH) | (abs_val2 & FRAC_MASK);
// result occupies N_BITS bits in the form of cnr1r2...r6
result = int_repr1_abs * int_repr2_abs;
// Code to extract number of leading bits
if (int_repr1_abs==0){
num_leading_zero1 = N_BITS;
}else{
aux_int_repr = int_repr1_abs;
if(aux_int_repr<=0x0F) {aux_int_repr<<=4; num_leading_zero1+=4;}
if(aux_int_repr<=0x3F) {aux_int_repr<<=2; num_leading_zero1+=2;}
if(aux_int_repr<=0x7F) {aux_int_repr<<=1; num_leading_zero1+=1;}
}
// Normalization of result is done after multiplication, according
// to leading zeros of multiplicands. Note that if multiplicand is normal,
// no need to shift result e.g. 1.000*0.001=1000 => 1000<<3 = 1.000000
num_leading_zero1 = num_leading_zero1 - EXP_WIDTH;
if (!is_normal1) result <<= num_leading_zero1;
// Code to extract number of leading bits
if (int_repr2_abs==0){
num_leading_zero2 = N_BITS;
}else{
aux_int_repr = int_repr2_abs;
if(aux_int_repr<=0x0F) {aux_int_repr<<=4; num_leading_zero2+=4;}
if(aux_int_repr<=0x3F) {aux_int_repr<<=2; num_leading_zero2+=2;}
if(aux_int_repr<=0x7F) {aux_int_repr<<=1; num_leading_zero2+=1;}
}
num_leading_zero2 = num_leading_zero2 - EXP_WIDTH;
if (!is_normal2) result <<= num_leading_zero2;
carry = result >> (N_BITS-1);
// This represents biased exponent - 1. Implicit normal bit is added
// afterwards to correctly pack the result
result_exp = (exp1 - is_normal1
+ exp2 - is_normal2
- num_leading_zero1 - num_leading_zero2
+ (1 - BIAS) + carry);
// In hardware, a slightly larger exponent range (herein 1 extra bit) to
// handle subnormals
if((result_exp >> (EXP_WIDTH + 1)) != 0){
result >>= -result_exp;
// Note that no sticky bits are computed from eliminated bits
result_exp = EMIN;
}
smaller_factor = (abs_val1 < abs_val2) ? abs_val1 : abs_val2;
if (smaller_factor == 0) result_exp = EMIN;
discarded_bits = (result & FRAC_MASK<<carry) >> carry;
lfsr = floor(rand(_vectorisation_idx) * (1 << GUARD_WIDTH));
// MSB of truncated result is included so that it is added to exponent,
// which was calculated as the final value minus 1
trunc_result = result >> (GUARD_WIDTH+carry);
round_factor = (discarded_bits+lfsr) >> GUARD_WIDTH;
trunc_result = trunc_result + round_factor;
result_abs = (result_exp<<FRAC_WIDTH) + trunc_result;
// Dealing with overflow. Note that sign bit is used for comparison, so no
// extra space would be needed in hardware
if (result_abs > (1 << (N_BITS-1)) - 1) result_abs = (1 << (N_BITS-1)) - 1;
// Note that negative zeros are not return from operations
if (result_abs==0) result_sign = 0;
return (result_sign << (EXP_WIDTH+FRAC_WIDTH)) + result_abs;
}
"""
fp8_multiply_stochastic.implementations.add_implementation('cpp', cpp_code,
dependencies={'rand': DEFAULT_FUNCTIONS['rand'],
'floor': DEFAULT_FUNCTIONS['floor']})
fp8_multiply_stochastic.implementations.add_implementation('cuda', cuda_code,
dependencies={'rand': DEFAULT_FUNCTIONS['rand'],
'floor': DEFAULT_FUNCTIONS['floor']})
main_eqs = """
dv/dt = (1-v)/(10*ms) : 1
fp8_value = fp8_multiply_stochastic(55, t/ms) : 1
"""
neu = NeuronGroup(1, main_eqs, threshold='v>0.8', reset='v=0', method='exact')
mon = StateMonitor(neu, 'fp8_value', record=0)
run(50*ms)
plot(mon.t/ms, mon.fp8_value[0])
show()
I get the following error with cuda:
INFO CUDA installation directory detected via location of `nvcc` binary: /usr/local/cuda [brian2.devices.cuda_standalone]
INFO Compiling device code for GPU 0 (Tesla V100-SXM2-32GB) [brian2.devices.cuda_standalone]
ERROR Brian 2 encountered an unexpected error. If you think this is a bug in Brian 2, please report this issue either to the discourse forum at <http://brian.discourse.group/>, or to the issue tracker at <https://github.com/brian-team/brian2/issues>. Please include this file with debug information in your report: /jobfs/106996242.gadi-pbs/brian_debug_hdalyjex.log Additionally, you can also include a copy of the script that was run, available at: /jobfs/106996242.gadi-pbs/brian_script_fcav6s9u.py You can also include a copy of the redirected std stream outputs, available at '/jobfs/106996242.gadi-pbs/brian_stdout_sbullzm8.log' and '/jobfs/106996242.gadi-pbs/brian_stderr_9i2uyrbs.log'. Thanks! [brian2]
Traceback (most recent call last):
File "/home/590/pu6813/brian2-sims/func_test.py", line 253, in <module>
run(50*ms)
File "/home/590/pu6813/.local/lib/python3.10/site-packages/brian2/units/fundamentalunits.py", line 2780, in new_f
result = f(*args, **kwds)
File "/home/590/pu6813/.local/lib/python3.10/site-packages/brian2/core/magic.py", line 407, in run
return magic_network.run(
File "/home/590/pu6813/.local/lib/python3.10/site-packages/brian2/core/magic.py", line 248, in run
Network.run(
File "/home/590/pu6813/.local/lib/python3.10/site-packages/brian2/core/base.py", line 333, in device_override_decorated_function
return getattr(curdev, name)(*args, **kwds)
File "/home/590/pu6813/.local/lib/python3.10/site-packages/brian2cuda/device.py", line 1732, in network_run
self.build(direct_call=False, **self.build_options)
File "/home/590/pu6813/.local/lib/python3.10/site-packages/brian2cuda/device.py", line 1456, in build
self.generate_rand_source(self.writer)
File "/home/590/pu6813/.local/lib/python3.10/site-packages/brian2cuda/device.py", line 1072, in generate_rand_source
rand_tmp = self.code_object_class().templater.rand(None, None,
File "/home/590/pu6813/.local/lib/python3.10/site-packages/brian2/codegen/templates.py", line 253, in __call__
return MultiTemplate(module)
File "/home/590/pu6813/.local/lib/python3.10/site-packages/brian2/codegen/templates.py", line 270, in __init__
s = autoindent_postfilter(str(f()))
File "/home/590/pu6813/.local/lib/python3.10/site-packages/jinja2/runtime.py", line 763, in __call__
return self._invoke(arguments, autoescape)
File "/home/590/pu6813/.local/lib/python3.10/site-packages/jinja2/runtime.py", line 777, in _invoke
rv = self._func(*arguments)
File "/home/590/pu6813/.local/lib/python3.10/site-packages/brian2cuda/templates/rand.cu", line 99, in macro
num_per_gen_{{type}}_{{co.name}} = num_per_cycle_{{type}}_{{co.name}} * {{type}}_interval_{{co.name}};
File "/home/590/pu6813/.local/lib/python3.10/site-packages/jinja2/runtime.py", line 852, in _fail_with_undefined_error
raise self._undefined_exception(self._undefined_message)
jinja2.exceptions.UndefinedError: 'weakref.ProxyType object' has no attribute '_N'