Functions¶
-
Variable
fl
::
operator+
(const Variable &lhs, const Variable &rhs)¶ Element-wise addition of two Variables.
\[ out = var_1 + var_2 \]
-
Variable
fl
::
operator+
(const double &lhs, const Variable &rhs)¶ Adds a scalar to each element in the Variable.
\[ out_i = value + var_i \]
-
Variable
fl
::
operator+
(const Variable &lhs, const double &rhs)¶ Adds a scalar to each element in the Variable.
\[ out_i = var_i + value \]
-
Variable
fl
::
operator*
(const Variable &lhs, const Variable &rhs)¶ Element-wise multiplication of two Variables.
\[ out = var_1 \times var_2 \]
-
Variable
fl
::
operator*
(const double &lhs, const Variable &rhs)¶ Multiplies each element in the Variable by a scalar.
\[ out_i = value \times var_i \]
-
Variable
fl
::
operator*
(const Variable &lhs, const double &rhs)¶ Multiplies each element in the Variable by a scalar.
\[ out_i = var_i \times value \]
-
Variable
fl
::
operator-
(const Variable &lhs, const Variable &rhs)¶ Element-wise subtraction of two Variables.
\[ out = var_1 - var_2 \]
-
Variable
fl
::
operator-
(const double &lhs, const Variable &rhs)¶ Subtracts each element in the Variable from a scalar.
\[ out_i = value - var_i \]
-
Variable
fl
::
operator-
(const Variable &lhs, const double &rhs)¶ Subtracts a scalar from each element in the Variable.
\[ out_i = var_i - value \]
-
Variable
fl
::
operator/
(const Variable &lhs, const Variable &rhs)¶ Element-wise division of two Variables.
\[ out = \frac{var_1}{var_2} \]
-
Variable
fl
::
operator/
(const double &lhs, const Variable &rhs)¶ Divides a scalar by each element in the Variable.
\[ out_i = \frac{value}{var_i} \]
-
Variable
fl
::
operator/
(const Variable &lhs, const double &rhs)¶ Divides each element in the Variable by a scalar.
\[ out_i = \frac{var_i}{value} \]
-
Variable
fl
::
operator>
(const Variable &lhs, const Variable &rhs)¶ [Non-differentiable] Element-wise comparison of two Variables.
\[ out = var_1 > var_2 \]
-
Variable
fl
::
operator>
(const double &lhs, const Variable &rhs)¶ [Non-differentiable] Element-wise comparison of a Variable and a scalar.
\[ out_i = value > var_i \]
-
Variable
fl
::
operator>
(const Variable &lhs, const double &rhs)¶ [Non-differentiable] Element-wise comparison of a Variable and a scalar.
\[ out_i = var_i > value \]
-
Variable
fl
::
operator<
(const Variable &lhs, const Variable &rhs)¶ [Non-differentiable] Element-wise comparison of two Variables.
\[ out = var_1 < var_2 \]
-
Variable
fl
::
operator<
(const double &lhs, const Variable &rhs)¶ [Non-differentiable] Element-wise comparison of a Variable and a scalar.
\[ out_i = value < var_i \]
-
Variable
fl
::
operator<
(const Variable &lhs, const double &rhs)¶ [Non-differentiable] Element-wise comparison of a Variable and a scalar.
\[ out_i = var_i < value \]
-
Variable
fl
::
operator>=
(const Variable &lhs, const Variable &rhs)¶ [Non-differentiable] Element-wise comparison of two Variables.
\[ out = var_1 >= var_2 \]
-
Variable
fl
::
operator>=
(const double &lhs, const Variable &rhs)¶ [Non-differentiable] Element-wise comparison of a Variable and a scalar.
\[ out_i = value >= var_i \]
-
Variable
fl
::
operator>=
(const Variable &lhs, const double &rhs)¶ [Non-differentiable] Element-wise comparison of a Variable and a scalar.
\[ out_i = var_i >= value \]
-
Variable
fl
::
operator<=
(const Variable &lhs, const Variable &rhs)¶ [Non-differentiable] Element-wise comparison of two Variables.
\[ out = var_1 <= var_2 \]
-
Variable
fl
::
operator<=
(const double &lhs, const Variable &rhs)¶ [Non-differentiable] Element-wise comparison of a Variable and a scalar.
\[ out_i = value <= var_i \]
-
Variable
fl
::
operator<=
(const Variable &lhs, const double &rhs)¶ [Non-differentiable] Element-wise comparison of a Variable and a scalar.
\[ out_i = var_i <= value \]
-
Variable
fl
::
operator&&
(const Variable &lhs, const Variable &rhs)¶ [Non-differentiable] Element-wise logical and of two Variables.
\[ out = var_1 \&\& var_2 \]
-
Variable
fl
::
operator!
(const Variable &input)¶ [Non-differentiable] Element-wise logical not of a Variable.
\[ out_i = !var_i \]
-
Variable
fl
::
negate
(const Variable &input)¶ Computes negative of each element in a Variable.
\[ out_i = -var_i \]
-
Variable
fl
::
reciprocal
(const Variable &input)¶ Computes reciprocal of each element in a Variable.
\[ out_i = \frac{1}{var_i} \]
-
Variable
fl
::
exp
(const Variable &input)¶ Computes exponential of each element in a Variable.
\[ out_i = e^{var_i} \]
-
Variable
fl
::
log
(const Variable &input)¶ Computes natural logarithm of each element in a Variable.
\[ out_i = log(var_i) \]
-
Variable
fl
::
pow
(const Variable &input, double p)¶ Computes power of each element in a Variable.
\[ out_i = var_i^p \]
-
Variable
fl
::
log1p
(const Variable &input)¶ Computes natural logarithm of (1 + element) for each element in a Variable.
\[ out_i = log(1.0 + var_i) \]
-
Variable
fl
::
sin
(const Variable &input)¶ Computes sine of each element in a Variable.
\[ out_i = sin(var_i) \]
-
Variable
fl
::
cos
(const Variable &input)¶ Computes cosine of each element in a Variable.
\[ out_i = cos(var_i) \]
-
Variable
fl
::
sqrt
(const Variable &input)¶ Computes square root of each element in a Variable.
\[ out_i = \sqrt{var_i} \]
-
Variable
fl
::
tanh
(const Variable &input)¶ Computes hyperbolic tangent of each element in a Variable.
\[ out_i = \frac{\exp(var_i) - \exp(-var_i)}{\exp(var_i) + \exp(-var_i)} \]
-
Variable
fl
::
clamp
(const Variable &input, const double min, const double max)¶ Clamps all elements in input into the range [
min
, max
] and returns the resulting tensor: \[ y_i = \begin{cases} \text{min} & \text{if } x_i < \text{min} \\ x_i & \text{if } \text{min} \leq x_i \leq \text{max} \\ \text{max} & \text{if } x_i > \text{max} \end{cases} \]
-
Variable
fl
::
sigmoid
(const Variable &input)¶ Computes sigmoid of each element in a Variable.
\[ out_i = \frac{1}{1 + \exp(-var_i)} \]
-
Variable
fl
::
swish
(const Variable &input, double beta)¶ Computes swish of each element in a Variable, from Ramachandran et al. (2017), Searching for Activation Functions.
\[ Swish(x) = x \cdot sigmoid(\beta x) \]where \(\beta\) is a constant, often is 1.
-
Variable
fl
::
erf
(const Variable &input)¶ Computes the error function of each element in a Variable.
\[ out_i = \frac{2}{\sqrt{\pi}} \int_0^{var_i} e^{-t^2} \, dt \]
-
Variable
fl
::
max
(const Variable &lhs, const Variable &rhs)¶ Returns element-wise maximum value of two Variables.
\[ out = max(var_1, var_2) \]
-
Variable
fl
::
max
(const Variable &lhs, const double &rhs)¶ Returns maximum value of a scalar and each element in a Variable.
\[ out_i = max(var_i, value) \]
-
Variable
fl
::
max
(const double &lhs, const Variable &rhs)¶ Returns maximum value of a scalar and each element in a Variable.
\[ out_i = max(value, var_i) \]
-
Variable
fl
::
min
(const Variable &lhs, const Variable &rhs)¶ Returns element-wise minimum value of two Variables.
\[ out = min(var_1, var_2) \]
-
Variable
fl
::
min
(const Variable &lhs, const double &rhs)¶ Returns minimum value of a scalar and each element in a Variable.
\[ out_i = min(var_i, value) \]
-
Variable
fl
::
min
(const double &lhs, const Variable &rhs)¶ Returns minimum value of a scalar and each element in a Variable.
\[ out_i = min(value, var_i) \]
-
Variable
fl
::
transpose
(const Variable &input, const Shape &dims = {})¶ Returns a tensor that is a transposed version of a Variable.
Reorders the input based on the shape of the input dimensions. If left empty, transposes over all dimensions (reverses all dimensions).
-
Variable
fl
::
tileAs
(const Variable &input, const Variable &reference)¶ Repeats the tensor
input
along certain dimensions so as to match the shape of reference
. The dimensions to be repeated along are automatically inferred.
-
Variable
fl
::
tileAs
(const Variable &input, const Shape &rdims)¶ Repeats the tensor
input
along certain dimensions so as to match the shape in the descriptor rdims
. The dimensions to be repeated along are automatically inferred.
-
Variable
fl
::
sumAs
(const Variable &input, const Variable &reference)¶ Sums up the tensor
input
along certain dimensions so as to match the shape of reference
. The dimensions to be summed along are automatically inferred. Note that after summation, the shape of those dimensions will be 1.
-
Variable
fl
::
sumAs
(const Variable &input, const Shape &rdims)¶ Sums up the tensor
input
along certain dimensions so as to match the shape in the descriptor rdims
. The dimensions to be summed along are automatically inferred. Note that after summation, the shape of those dimensions will be 1.
-
Variable
fl
::
concatenate
(const std::vector<Variable> &concatInputs, int dim)¶ Concatenates Variables along a specific dimension.
The shape of input Variables should be identical except the dimension to concatenate.
-
std::vector<Variable>
fl
::
split
(const Variable &input, long splitSize, int dim)¶ Splits a Variable into equally sized chunks (if possible)
-
std::vector<Variable>
fl
::
split
(const Variable &input, const std::vector<long> &splitSizes, int dim)¶ Splits a Variable into smaller chunks.
-
Variable
fl
::
tile
(const Variable &input, const Shape &dims)¶ Repeats the tensor
input
along specific dimensions. The number of repetitions along each dimension is specified in descriptor
dims
.
-
Variable
fl
::
tile
(const Variable &input, const Shape &dims, const fl::dtype precision)¶ Repeats the tensor
input
along specific dimensions. The number of repetitions along each dimension is specified in descriptor
dims
.- Parameters
[in] precision
: Type of the output vector when it is desired to be different from the input type. This is particularly useful when tile is applied on parameters and the results will be used in half-precision arithmetic.
-
Variable
fl
::
sum
(const Variable &input, const std::vector<int> &axes, bool keepDims = false)¶ Sums up the tensors
input
along dimensions specified in descriptoraxes
.If
axes
has size greater than 1, reduce over all of them.
-
Variable
fl
::
mean
(const Variable &input, const std::vector<int> &axes, bool keepDims = false)¶ Computes the mean of the tensor
input
along dimensions specified in descriptoraxes
.If
axes
has size greater than 1, reduce over all of them.
-
Variable
fl
::
norm
(const Variable &input, const std::vector<int> &axes, double p = 2, bool keepDims = false)¶ Lp-norm computation, reduced over specified dimensions.
- Parameters
input
: tensor on which the Lp norm is going to be computed.p
: the p value of the Lp norm.axes
: dimensions over which the reduction is performed.
-
Variable
fl
::
normalize
(const Variable &input, const std::vector<int> &axes, double p = 2, double eps = 1e-12)¶ Lp norm normalization of values across the given dimensions.
- Parameters
input
: the tensor to be normalized.axes
: dimensions over which values are normalized.p
: the p value of the Lp norm.eps
: clamping value to avoid overflows.
-
Variable
fl
::
var
(const Variable &input, const std::vector<int> &axes, const bool isbiased = false, bool keepDims = false)¶ Computes variance of the tensor
input
along dimensions specified in descriptoraxes
.If
axes
has size greater than 1, reduce over all of them. Uses population variance if isbiased
is true
, otherwise, uses sample variance. NB: the behavior of
fl::var
differs from that of af::var
. In ArrayFire versions >= 3.7.0, if isbiased
is true
the variance computation uses sample variance; if false
, population variance is used. For versions of ArrayFire before v3.7.0, the reverse is true. TODO:{fl::Tensor} make this behavior consistent
-
Variable
fl
::
matmul
(const Variable &lhs, const Variable &rhs)¶ Conducts matrix-matrix multiplication on two Variables.
This is a batched function if \(B_1\) or \(B_2\) is greater than 1.
-
Variable
fl
::
matmulTN
(const Variable &lhs, const Variable &rhs)¶ Conducts matrix-matrix multiplication on two Variables, where the first one will be transposed before multiplication.
This is a batched function if \(B_1\) or \(B_2\) is greater than 1.
-
Variable
fl
::
matmulNT
(const Variable &lhs, const Variable &rhs)¶ Conducts matrix-matrix multiplication on two Variables, where the second one will be transposed before multiplication.
This is a batched function if \(B_1\) or \(B_2\) is greater than 1.
-
Variable
fl
::
abs
(const Variable &input)¶ Returns the absolute values of each element in a Variable.
\[ out_i = |var_i| \]
-
Variable
fl
::
moddims
(const Variable &input, const Shape &dims)¶ Modifies the input dimensions without changing the data order.
The shape of the output Variable is specified in descriptor
dims
.
-
Variable
fl
::
reorder
(const Variable &input, const Shape &shape)¶ Exchanges data of an array such that the requested change in dimension is satisfied.
The linear ordering of data within the array is preserved.
-
Variable
fl
::
linear
(const Variable &input, const Variable &weight)¶ Applies a linear transformation to the input Variable:
\[ y = Ax \].
-
Variable
fl
::
linear
(const Variable &input, const Variable &weight, const Variable &bias)¶ Applies a linear transformation to the input Variable:
\[ y = Ax + b \].
Applies a 2D convolution over an input signal given filter weights.
In the simplest case, the output with shape [ \(X_{out}\), \(Y_{out}\), \(C_{out}\), \(N\)] of the convolution with input [ \(X_{in}\), \(Y_{in}\), \(C_{in}\), \(N\)] and weight [ \(K_x\), \(K_y\), \(C_{in}\), \(C_{out}\)] can be precisely described as:
\[ \text{out}(C_{out_j}, N_i) = \sum_{k = 0}^{C_{in} - 1} \text{weight}(k, C_{out_j}) \star \text{input}(k, N_i) \]- Return
a Variable with shape [ \(X_{out}\), \(Y_{out}\), \(C_{out}\), \(N\)]
- Parameters
input
: a Variable with shape [ \(X_{in}\), \(Y_{in}\), \(C_{in}\), \(N\)]weights
: a Variable with shape [ \(K_x\), \(K_y\), \(C_{in}\), \(C_{out}\)]sx
: stride in the first dimensionsy
: stride in the second dimensionpx
: number of positions of zero-padding on both sides in the first dimensionpy
: number of positions of zero-padding on both sides in the second dimensiondx
: dilation along the first kernel dimension. A dilation of 1 is equivalent to a standard convolution along this axis.dy
: dilation along the second kernel dimension. A dilation of 1 is equivalent to a standard convolution along this axis.groups
: number of filter groupsbenchmarks
: [optional] aConvBenchmarks
instance to use to dynamically benchmark configuration attributes for computations.
Applies a 2D convolution over an input signal given filter weights and biases.
In the simplest case, the output with shape [ \(X_{out}\), \(Y_{out}\), \(C_{out}\), \(N\)] of the convolution with input [ \(X_{in}\), \(Y_{in}\), \(C_{in}\), \(N\)] and weight [ \(K_x\), \(K_y\), \(C_{in}\), \(C_{out}\)] can be precisely described as:
\[ \text{out}(C_{out_j}, N_i) = \text{bias}(C_{out_j}) + \sum_{k = 0}^{C_{in} - 1} \text{weight}(k, C_{out_j}) \star \text{input}(k, N_i) \]- Return
a Variable with shape [ \(X_{out}\), \(Y_{out}\), \(C_{out}\), \(N\)]
- Parameters
input
: a Variable with shape [ \(X_{in}\), \(Y_{in}\), \(C_{in}\), \(N\)]weights
: a Variable with shape [ \(K_x\), \(K_y\), \(C_{in}\), \(C_{out}\)]sx
: stride in the first dimensionsy
: stride in the second dimensionpx
: number of positions of zero-padding on both sides in the first dimensionpy
: number of positions of zero-padding on both sides in the second dimensiondx
: dilation along the first kernel dimension. A dilation of 1 is equivalent to a standard convolution along this axis.dy
: dilation along the second kernel dimension. A dilation of 1 is equivalent to a standard convolution along this axis.groups
: number of filter groupsbenchmarks
: [optional] aConvBenchmarks
instance to use to dynamically benchmark configuration attributes for computations.bias
: a Variable with shape [ \(C_{out}\)]
-
Variable
fl
::
pool2d
(const Variable &input, int wx, int wy, int sx = 1, int sy = 1, int px = 0, int py = 0, PoolingMode mode = PoolingMode::MAX)¶ Applies a 2D pooling over an input signal composed of several input planes.
- Parameters
input
: a Variable with shape [ \(X_{in}\), \(Y_{in}\), \(C\), \(N\)]wx
: pooling window size in the first dimensionwy
: pooling window size in the second dimensionsx
: stride in the first dimensionsy
: stride in the second dimensionpx
: number of positions of zero-padding on both sides in the first dimensionpy
: number of positions of zero-padding on both sides in the second dimensionmode
: pooling mode, which supports:MAX
AVG_INCLUDE_PADDING
AVG_EXCLUDE_PADDING
-
Variable
fl
::
softmax
(const Variable &input, const int dim)¶ Applies a softmax function on Variable
input
along dimension dim
, so that the elements along dimension dim
in output lie in the range (0,1) and sum to 1. \[ out(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)} \]
-
Variable
fl
::
logSoftmax
(const Variable &input, const int dim)¶ Applies a log(softmax(x)) function on Variable
input
along dimensiondim
\[ out(x_{i}) = log \Big( \frac{exp(x_i)}{\sum_j exp(x_j)} \Big) \].
-
Variable
fl
::
binaryCrossEntropy
(const Variable &inputs, const Variable &targets)¶ Computes the binary cross entropy loss between an input tensor \(x\) and a target tensor \(y\).
The binary cross entropy loss is:
\[ B(x, y) = \frac{1}{n} \sum_{i = 1}^n -\left( y_i \times \log(x_i) + (1 - y_i) \times \log(1 - x_i) \right) \] Both the inputs and the targets are expected to be between 0 and 1.
- Parameters
inputs
: a tensor with the predicted valuestargets
: a tensor with the target values
-
Variable
fl
::
categoricalCrossEntropy
(const Variable &input, const Variable &targets, ReduceMode reduction = ReduceMode::MEAN, int ignoreIndex = -1)¶ Computes the categorical cross entropy loss.
The input is expected to contain log-probabilities for each class. The targets should be the index of the ground truth class for each input example.
\[\begin{split} \begin{split}\ell(x, y) = \begin{cases} \frac{1}{N} \sum_{n=1}^N -x_{n,y_n}, & \text{if}\; \text{reduction} = \text{MEAN},\\ \sum_{n=1}^N -x_{n,y_n}, & \text{if}\; \text{reduction} = \text{SUM}, \\ \{ -x_{1,y_1}, ..., -x_{N,y_N} \}, & \text{if}\; \text{reduction} = \text{NONE}. \end{cases}\end{split} \end{split}\]- Return
a
Variable
of loss value with shape scalar by default. If reduction
is NONE, then [ \(B_1\), \(B_2\), \(B_3\)].
- Parameters
input
: aVariable
with shape [ \(C\), \(B_1\), \(B_2\), \(B_3\)] where \(C\) is the number of classes.targets
: an integerVariable
with shape [ \(B_1\), \(B_2\), \(B_3\)]. The values must be in \([0, C - 1]\)reduction
: reduction mode, which supports:NONE
MEAN
SUM
ignoreIndex
: a target value that is ignored and does not contribute to the loss or the input gradient. If reduction
is MEAN, the loss is averaged over non-ignored targets. Only indices in \([0, C - 1]\) are considered to be valid.
-
Variable
fl
::
weightedCategoricalCrossEntropy
(const Variable &input, const Variable &targets, const Variable &weight, int ignoreIndex)¶ Computes the weighted cross entropy loss.
The input is expected to contain log-probabilities for each class. The targets should be the index of the ground truth class for each input example.
\[ \ell(x, y) = \frac{\sum_{n=1}^N weight[y_n] \cdot (-x_{n,y_n})}{\sum_{n=1}^N weight[y_n]} \]
- Return
a
Variable
of loss value with shape scalar by default.- Parameters
input
: aVariable
with shape [ \(C\), \(B_1\), \(B_2\), \(B_3\)] where \(C\) is the number of classes.targets
: an integerVariable
with shape [ \(B_1\), \(B_2\), \(B_3\)]. The values must be in \([0, C - 1]\).
weight
: a
Variable
of per-class weights, indexed by class index in \([0, C - 1]\).
ignoreIndex
: a target value that is ignored and does not contribute to the loss or the input gradient. The loss is averaged over non-ignored targets. Only indices in \([0, C - 1]\) are considered to be valid.
-
Variable
fl
::
gatedlinearunit
(const Variable &input, const int dim)¶ The gated linear unit.
\[ H = A \times \sigma(B) \] where input
is split in half along dim
to formA
andB
. See Language Modeling with Gated Convolutional Networks.- Parameters
input
: input Variabledim
: dimension on which to split the input
-
std::tuple<Variable, Variable, Variable>
fl
::
rnn
(const Variable &input, const Variable &hiddenState, const Variable &cellState, const Variable &weights, int hiddenSize, int numLayers, RnnMode mode, bool bidirectional, float dropout)¶ Applies an RNN unit to an input sequence.
A general RNN operator can be expressed as following:
\[ (h_t, c_t) = f_W(x_t, h_{t-1}, c_{t-1}) \]where \(h_t\), \(c_t\) are the hidden/cell state at time \(t\), \(x_t\) is the input at time \(t\)- Note
{cuDNN and oneDNN RNN weights are incompatible since the structure of the computation is different for each. There is no mapping between weights from each of those backends.}
- Return
a tuple of three Variables:
y
: input with shape [input size, batch size, sequence length * directions]hiddenState
: hidden state for the current time stepcellState
: cell state for the current time step
- Parameters
input
: Variable of input with shape [input size, batch size, sequence length]hiddenState
: Variable of hidden state with shape [hidden size, batch size, total layers]cellState
: [LSTM only] Variable of cell state with same shape as hidden stateweights
: Learnable parameters in the RNN unithiddenSize
: number of features in the hidden statenumLayers
: number of recurrent layersmode
: defines the type of RNN unitRELU
TANH
LSTM
GRU
bidirectional
: ifTrue
, becomes a bidirectional RNN, unidirectional otherwisedropout
: if non-zero, introduces aDropout
layer on the outputs of each RNN layer except the last one, with dropout probability equal to dropout
-
Variable
fl
::
embedding
(const Variable &input, const Variable &embeddings)¶ Looks up embeddings in a fixed dictionary and size.
- Return
a Variable of embeddings with shape [ \(D\), \(B_1\), \(B_2\), \(B_3\)]
- Parameters
-
Variable
fl
::
batchnorm
(const Variable &input, const Variable &weight, const Variable &bias, Variable &runningMean, Variable &runningVar, const std::vector<int> &axes, bool train, double momentum, double epsilon)¶ Applies Batch Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift .
\[ y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta \]The mean and standard-deviation are calculated per-dimension over the mini-batches and \(\gamma\) and \(\beta\) are learnable parameter vectors of size \(C\), the input size. By default, during training this layer keeps running estimates of its computed mean and variance, which are then used for normalization during evaluation.- Return
a Variable with same shape as
input
- Parameters
input
: a Variable with size [ \(H\), \(W\), \(C\), \(N\)]weight
: a Variable with size [ \(C\)] for \(\gamma\)bias
: a Variable with size [ \(C\)] for \(\beta\)runningMean
: a buffer storing intermediate means during trainingrunningVar
: a buffer storing intermediate variances during trainingaxes
: dimensions to perform normalization on. If having size greater than one, reduce over all of them.train
: a flag indicating if running in training modemomentum
: value of momentumepsilon
: value of \(\epsilon\)
-
Variable
fl
::
padding
(const Variable &input, std::vector<std::pair<int, int>> pad, double val)¶ Applies asymmetric padding on a Variable
input
.
-
Variable
fl
::
relu
(const Variable &input)¶ Applies the rectified linear unit function element-wise to a
Variable
:\[ ReLU(x) = \max(0, x) \].
-
Variable
fl
::
gelu
(const Variable &input)¶ Applies the Gaussian Error Linear Unit (GELU) function element-wise to a
Variable
.
-
Variable
fl
::
relativePositionalEmbeddingRotate
(const Variable &input)¶ Relative positional embedding for the multihead attention. Implementation partially follows https://arxiv.org/pdf/1803.02155.pdf.
-
Variable
fl
::
multiheadAttention
(const Variable &query, const Variable &key, const Variable &value, const Variable &posEmb, const Variable &mask, const Variable &padMask, const int32_t nHeads, const double pDropout, const int32_t offset = 0)¶ Multihead Attention function. For details, see Vaswani et al (2017).
- Parameters
query
: query Variable of size T x nHeads * headDim x Bkey
: key Variable of size Time x nHeads * headDim x Bvalue
: value Variable of size Time x nHeads * headDim x BposEmb
: if non-empty, compute relative positional embedding in addition to the standard computations.
mask
: mask or not future in the computations T x T if non-empty then don’t use future (for example for autoregressive language models or for decoder part in the encoder-decoder transformer models)padMask
: mask which is 1 for positions where pad token is, don’t attend to the pad-positions, of size T x BnHeads
: number of headspDropout
: dropout probabilityoffset
: size of the current output from the decoder used now as input