|
4 | 4 |
|
5 | 5 | @dataclass |
6 | 6 | class DefaultMambularConfig: |
| 7 | + """ |
| 8 | + Configuration class for the Default Mambular model with predefined hyperparameters. |
| 9 | +
|
| 10 | + Parameters |
| 11 | + ---------- |
| 12 | + lr : float, default=1e-04 |
| 13 | + Learning rate for the optimizer. |
| 14 | + lr_patience : int, default=10 |
| 15 | + Number of epochs with no improvement after which learning rate will be reduced. |
| 16 | + weight_decay : float, default=1e-06 |
| 17 | + Weight decay (L2 penalty) for the optimizer. |
| 18 | + lr_factor : float, default=0.1 |
| 19 | + Factor by which the learning rate will be reduced. |
| 20 | + d_model : int, default=64 |
| 21 | + Dimensionality of the model. |
| 22 | + n_layers : int, default=8 |
| 23 | + Number of layers in the model. |
| 24 | + expand_factor : int, default=2 |
| 25 | + Expansion factor for the feed-forward layers. |
| 26 | + bias : bool, default=False |
| 27 | + Whether to use bias in the linear layers. |
| 28 | + d_conv : int, default=16 |
| 29 | + Dimensionality of the convolutional layers. |
| 30 | + conv_bias : bool, default=True |
| 31 | + Whether to use bias in the convolutional layers. |
| 32 | + dropout : float, default=0.05 |
| 33 | + Dropout rate for regularization. |
| 34 | + dt_rank : str, default="auto" |
| 35 | + Rank of the decision tree. |
| 36 | + d_state : int, default=32 |
| 37 | + Dimensionality of the state in recurrent layers. |
| 38 | + dt_scale : float, default=1.0 |
| 39 | + Scaling factor for decision tree. |
| 40 | + dt_init : str, default="random" |
| 41 | + Initialization method for decision tree. |
| 42 | + dt_max : float, default=0.1 |
| 43 | + Maximum value for decision tree initialization. |
| 44 | + dt_min : float, default=1e-04 |
| 45 | + Minimum value for decision tree initialization. |
| 46 | + dt_init_floor : float, default=1e-04 |
| 47 | + Floor value for decision tree initialization. |
| 48 | + norm : str, default="RMSNorm" |
| 49 | + Normalization method to be used. |
| 50 | + activation : callable, default=nn.SELU() |
| 51 | + Activation function for the model. |
| 52 | + num_embedding_activation : callable, default=nn.Identity() |
| 53 | + Activation function for numerical embeddings. |
| 54 | + head_layer_sizes : list, default=(128, 64, 32) |
| 55 | + Sizes of the layers in the head of the model. |
| 56 | + head_dropout : float, default=0.5 |
| 57 | + Dropout rate for the head layers. |
| 58 | + head_skip_layers : bool, default=False |
| 59 | + Whether to skip layers in the head. |
| 60 | + head_activation : callable, default=nn.SELU() |
| 61 | + Activation function for the head layers. |
| 62 | + head_use_batch_norm : bool, default=False |
| 63 | + Whether to use batch normalization in the head layers. |
| 64 | + layer_norm_after_embedding : bool, default=False |
| 65 | + Whether to apply layer normalization after embedding. |
| 66 | + pooling_method : str, default="avg" |
| 67 | + Pooling method to be used ('avg', 'max', etc.). |
| 68 | + bidirectional : bool, default=False |
| 69 | + Whether to use bidirectional processing of the input sequences. |
| 70 | + use_learnable_interaction : bool, default=False |
| 71 | + Whether to use learnable feature interactions before passing through mamba blocks. |
| 72 | + """ |
| 73 | + |
7 | 74 | lr: float = 1e-04 |
8 | 75 | lr_patience: int = 10 |
9 | 76 | weight_decay: float = 1e-06 |
|
0 commit comments