import torch.nn as nn
import torch.nn.init as nn_init

from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer
from ..configs.autoint_config import DefaultAutoIntConfig
from .utils.basemodel import BaseModel


class AutoInt(BaseModel):
    """
    AutoInt: Automatic Feature Interaction Learning via Self-Attentive Neural Networks.

    This model uses multi-head self-attention layers to learn feature interactions for tabular data.
    It supports key-value compression for memory efficiency and is compatible with embedding-based
    feature encodings.

    Parameters
    ----------
    feature_information : tuple
        A tuple containing information about numerical features, categorical features,
        and any additional embeddings. Expected format: `(num_feature_info, cat_feature_info, embedding_feature_info)`.
    num_classes : int, default=1
        Number of output classes. For regression, this should be set to `1`.
    config : DefaultAutoIntConfig, optional
        Configuration object containing hyperparameters such as `d_model`, `n_heads`, `n_layers`,
        dropout rates, and compression settings.
    **kwargs : dict
        Additional arguments passed to the `BaseModel`.

    Attributes
    ----------
    embedding_layer : EmbeddingLayer
        Module that processes numerical and categorical features into embeddings.
    kv_compression : float or None
        The proportion of key-value compression. If `None`, no compression is applied.
    kv_compression_sharing : str or None
        Defines how key-value compression modules are shared. Options:
        - `"layerwise"`: A single compression module shared across all layers and
          between keys and values.
        - `"headwise"`: Separate key and value compression modules per layer.
        - `"key-value"`: One compression module per layer, reused for both keys and values.
    shared_kv_compression : nn.Linear or None
        Shared key-value compression layer, used when `kv_compression_sharing="layerwise"`.
    layers : nn.ModuleList
        A list of transformer-based attention layers, each consisting of:
        - `attention`: Multi-head self-attention module.
        - `linear`: Fully connected layer for projection.
        - `norm0`: Layer normalization applied before attention.
    last_norm : nn.LayerNorm or None
        Final normalization layer applied before the output head when `config.prenorm` is enabled.
    head : nn.Linear
        Output layer mapping from the processed feature representation to the final predictions.
    """

    def __init__(
        self,
        feature_information: tuple,  # (num_feature_info, cat_feature_info, embedding_feature_info)
        num_classes=1,
        config: DefaultAutoIntConfig = DefaultAutoIntConfig(),  # noqa: B008
        **kwargs,
    ):
        super().__init__(config=config, **kwargs)
        self.save_hyperparameters(ignore=["feature_information"])
        self.returns_ensemble = False

        # Embedding layer
        self.embedding_layer = EmbeddingLayer(*feature_information, config=config)
        n_inputs = sum(len(info) for info in feature_information)

        # Key-Value Compression
        self.kv_compression = config.kv_compression
        self.kv_compression_sharing = config.kv_compression_sharing

        def make_kv_compression():
            compression = nn.Linear(
                n_inputs,
                int(n_inputs * config.kv_compression),
                bias=False,
            )
            nn_init.xavier_uniform_(compression.weight)
            return compression
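
        # Note: compression acts along the token (sequence) axis, not d_model;
        # e.g. with n_inputs=16 tokens and kv_compression=0.5, keys and values
        # are projected from 16 down to 8 tokens before attention.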

        self.shared_kv_compression = (
            make_kv_compression()
            if self.kv_compression and self.kv_compression_sharing == "layerwise"
            else None
        )

        # Transformer-based Interaction Layers
        self.layers = nn.ModuleList()
        for _ in range(config.n_layers):
            layer = nn.ModuleDict(
                {
                    "attention": nn.MultiheadAttention(
                        embed_dim=config.d_model,
                        num_heads=config.n_heads,
                        dropout=config.attn_dropout,
                        batch_first=True,
                    ),
                    "linear": nn.Linear(config.d_model, config.d_model, bias=False),
                    "norm0": nn.LayerNorm(config.d_model),
                }
            )

            if self.kv_compression and self.shared_kv_compression is None:
                layer["key_compression"] = make_kv_compression()
                if self.kv_compression_sharing == "headwise":
                    # "headwise": separate key and value compression per layer
                    layer["value_compression"] = make_kv_compression()
                else:
                    # "key-value": the key compression module is reused for values
                    assert self.kv_compression_sharing == "key-value"

            self.layers.append(layer)

        # Final Normalization & Output Head
        self.last_norm = (
            nn.LayerNorm(config.d_model) if getattr(config, "prenorm", False) else None
        )

        self.head = nn.Linear(config.d_model * n_inputs, num_classes)
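        # Each of the n_inputs feature tokens contributes d_model values to the
        # flattened representation consumed by the head.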

    def _get_kv_compressions(self, layer):
        """
        Returns the correct key-value compression layers based on the sharing strategy.

        Parameters
        ----------
        layer : nn.ModuleDict
            The transformer layer containing possible key-value compression modules.

        Returns
        -------
        tuple of (nn.Linear or None, nn.Linear or None)
            The key compression and value compression layers, or `(None, None)` if no compression is applied.
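
        Notes
        -----
        For example, with `kv_compression_sharing="key-value"` a layer stores only
        `key_compression`, so the same module is returned for both keys and values.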
        """
        if self.shared_kv_compression is not None:
            # "layerwise": one module shared across layers and between keys/values
            return self.shared_kv_compression, self.shared_kv_compression
        if "key_compression" in layer and "value_compression" in layer:
            # "headwise": this layer has its own key and value compression
            return layer["key_compression"], layer["value_compression"]
        if "key_compression" in layer:
            # "key-value": the key compression module is reused for values
            return layer["key_compression"], layer["key_compression"]
        return None, None

    def forward(self, *data):
        """
        Forward pass of the AutoInt model.

        Parameters
        ----------
        *data : tuple
            Input tuple of tensors containing numerical features, categorical features, and embeddings.

        Returns
        -------
        Tensor
            The output predictions of the model.
        """
        x = self.embedding_layer(*data)  # Shape: (N, J, d_model)

        for layer in self.layers:
            # Pre-norm: normalize the attention input; keep x for the residual
            x_residual = layer["norm0"](x)

            # Retrieve key-value compression layers (None when compression is disabled)
            key_compression, value_compression = self._get_kv_compressions(layer)

            # Multi-head self-attention; when compression is enabled, keys and
            # values are first compressed along the token axis
            if key_compression is not None:
                key = key_compression(x_residual.transpose(1, 2)).transpose(1, 2)
                value = value_compression(x_residual.transpose(1, 2)).transpose(1, 2)
            else:
                key = value = x_residual
            x_residual, _ = layer["attention"](x_residual, key, value)

            # First residual connection
            x = x + x_residual

            # Linear projection with a second residual connection
            x_residual = layer["linear"](x)
            x = x + x_residual

        if self.last_norm is not None:
            x = self.last_norm(x)  # Final normalization (pre-norm configurations)

        x = x.flatten(1)  # (N, J, d_model) -> (N, J * d_model)
        return self.head(x)  # Final predictions