from typing import Optional

import torch
from torch import Tensor
from torch.nn.parameter import Parameter

from .module import Module
from .. import functional as F
from .. import init
__all__ = ['Embedding', 'EmbeddingBag']
class Embedding(Module): r"""一个简单的查找表,存储固定字典和大小的嵌入。 该模块通常用于存储词嵌入并使用索引检索它们。 该模块的输入是索引列表,输出是相应的词嵌入。
Args: num_embeddings (int): 嵌入词典的大小 embedding_dim (int): 每个嵌入向量的大小 padding_idx (int, optional): 如果指定,`padding_idx` 处的条目不会对梯度产生贡献; 因此,`padding_idx` 处的嵌入向量在训练期间不会更新, 即它仍为固定的“填充”。对于新构建的嵌入, :attr:`padding_idx` 处的嵌入向量将默认为全零, 但可以更新为其他值以用作填充向量。 max_norm (float, optional): 如果给定,则每个范数大于`max_norm` 的嵌入向量将被重新规范化以具有范数`max_norm`。 norm_type (float, optional): 用于计算 `max_norm` 选项的 p 范数的 p。默认值为 ``2``。 scale_grad_by_freq (bool, optional): 如果指定,这将按小批量中单词频率的倒数缩放梯度。默认为“False”。 sparse (bool, optional): 如果为“True”,则相对于 “weight” 矩阵的梯度将为稀疏张量。
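
    For illustration only (the sizes and tolerance below are arbitrary, not part of the API),
    :attr:`max_norm` caps the norm of every row returned by the lookup::

        >>> embedding = nn.Embedding(10, 3, max_norm=1.0)
        >>> out = embedding(torch.tensor([0, 1, 2]))
        >>> bool((out.norm(dim=1) <= 1.0 + 1e-6).all())
        True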

    Attributes:
        weight (Tensor): the learnable weights of the module of shape (num_embeddings, embedding_dim),
            initialized from :math:`\mathcal{N}(0, 1)`

    Shape:
        - Input: :math:`(*)`, IntTensor or LongTensor of arbitrary shape containing the indices to extract
        - Output: :math:`(*, H)`, where `*` is the input shape and :math:`H=\text{embedding\_dim}`

    .. note::
        Keep in mind that only a limited number of optimizers support sparse gradients:
        currently it's :class:`optim.SGD` (`CUDA` and `CPU`), :class:`optim.SparseAdam`
        (`CUDA` and `CPU`) and :class:`optim.Adagrad` (`CPU`)
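
    .. note::
        As a minimal sketch (the layer sizes and the choice of :class:`optim.SparseAdam`
        below are illustrative, not a recommendation), a sparse-gradient setup looks
        like this::

            >>> embedding = nn.Embedding(1000, 64, sparse=True)
            >>> optimizer = torch.optim.SparseAdam(embedding.parameters())
            >>> loss = embedding(torch.tensor([0, 2, 5])).sum()
            >>> loss.backward()
            >>> embedding.weight.grad.is_sparse
            True
            >>> optimizer.step()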

    .. note::
        When :attr:`max_norm` is not ``None``, :class:`Embedding`'s forward method will modify the
        :attr:`weight` tensor in-place. Since tensors needed for gradient computations cannot be
        modified in-place, performing a differentiable operation on ``Embedding.weight`` before
        calling :class:`Embedding`'s forward method requires cloning ``Embedding.weight`` when
        :attr:`max_norm` is not ``None``. For example::

            n, d, m = 3, 5, 7
            embedding = nn.Embedding(n, d, max_norm=1.0)
            W = torch.randn((m, d), requires_grad=True)
            idx = torch.tensor([1, 2])
            a = embedding.weight.clone() @ W.t()  # weight must be cloned for this to be differentiable
            b = embedding(idx) @ W.t()  # modifies weight in-place
            out = (a.unsqueeze(0) + b.unsqueeze(1))
            loss = out.sigmoid().prod()
            loss.backward()

    Examples::

        >>> # an Embedding module containing 10 tensors of size 3
        >>> embedding = nn.Embedding(10, 3)
        >>> # a batch of 2 samples of 4 indices each
        >>> input = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]])
        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> embedding(input)
        tensor([[[-0.0251, -1.6902,  0.7172],
                 [-0.6431,  0.0748,  0.6969],
                 [ 1.4970,  1.3448, -0.9685],
                 [-0.3677, -2.7265, -0.1685]],

                [[ 1.4970,  1.3448, -0.9685],
                 [ 0.4362, -0.4004,  0.9400],
                 [-0.6431,  0.0748,  0.6969],
                 [ 0.9124, -2.3616,  1.1151]]])

        >>> # example with padding_idx
        >>> embedding = nn.Embedding(10, 3, padding_idx=0)
        >>> input = torch.LongTensor([[0, 2, 0, 5]])
        >>> embedding(input)
        tensor([[[ 0.0000,  0.0000,  0.0000],
                 [ 0.1535, -2.0309,  0.9315],
                 [ 0.0000,  0.0000,  0.0000],
                 [-0.1655,  0.9897,  0.0635]]])

        >>> # example of changing `pad` vector
        >>> padding_idx = 0
        >>> embedding = nn.Embedding(3, 3, padding_idx=padding_idx)
        >>> embedding.weight
        Parameter containing:
        tensor([[ 0.0000,  0.0000,  0.0000],
                [-0.7895, -0.7089, -0.0364],
                [ 0.6778,  0.5803,  0.2678]], requires_grad=True)
        >>> with torch.no_grad():
        ...     embedding.weight[padding_idx] = torch.ones(3)
        >>> embedding.weight
        Parameter containing:
        tensor([[ 1.0000,  1.0000,  1.0000],
                [-0.7895, -0.7089, -0.0364],
                [ 0.6778,  0.5803,  0.2678]], requires_grad=True)
    """

    __constants__ = ['num_embeddings', 'embedding_dim', 'padding_idx', 'max_norm',
                     'norm_type', 'scale_grad_by_freq', 'sparse']

    num_embeddings: int
    embedding_dim: int
    padding_idx: Optional[int]
    max_norm: Optional[float]
    norm_type: float
    scale_grad_by_freq: bool
    weight: Tensor
    freeze: bool
    sparse: bool

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None,
                 max_norm: Optional[float] = None, norm_type: float = 2., scale_grad_by_freq: bool = False,
                 sparse: bool = False, _weight: Optional[Tensor] = None, _freeze: bool = False,
                 device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        if padding_idx is not None:
            if padding_idx > 0:
                assert padding_idx < self.num_embeddings, 'Padding_idx must be within num_embeddings'
            elif padding_idx < 0:
                assert padding_idx >= -self.num_embeddings, 'Padding_idx must be within num_embeddings'
                padding_idx = self.num_embeddings + padding_idx
        self.padding_idx = padding_idx
        self.max_norm = max_norm
        self.norm_type = norm_type
        self.scale_grad_by_freq = scale_grad_by_freq
        if _weight is None:
            self.weight = Parameter(torch.empty((num_embeddings, embedding_dim), **factory_kwargs),
                                    requires_grad=not _freeze)
            self.reset_parameters()
        else:
            assert list(_weight.shape) == [num_embeddings, embedding_dim], \
                'Shape of weight does not match num_embeddings and embedding_dim'
            self.weight = Parameter(_weight, requires_grad=not _freeze)
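
        # Note: `sparse` only controls the layout of the gradient w.r.t. `weight`
        # (a sparse tensor instead of a dense one); the forward lookup itself is unchanged.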
        self.sparse = sparse

    def reset_parameters(self) -> None:
        init.normal_(self.weight)
        self._fill_padding_idx_with_zero()

    def _fill_padding_idx_with_zero(self) -> None:
        if self.padding_idx is not None:
            with torch.no_grad():
                self.weight[self.padding_idx].fill_(0)

    def forward(self, input: Tensor) -> Tensor:
        return F.embedding(
            input, self.weight, self.padding_idx, self.max_norm,
            self.norm_type, self.scale_grad_by_freq, self.sparse)

    def extra_repr(self) -> str:
        s = '{num_embeddings}, {embedding_dim}'
        if self.padding_idx is not None:
            s += ', padding_idx={padding_idx}'
        if self.max_norm is not None:
            s += ', max_norm={max_norm}'
        if self.norm_type != 2:
            s += ', norm_type={norm_type}'
        if self.scale_grad_by_freq is not False:
            s += ', scale_grad_by_freq={scale_grad_by_freq}'
        if self.sparse is not False:
            s += ', sparse=True'
        return s.format(**self.__dict__)

    @classmethod
    def from_pretrained(cls, embeddings, freeze=True, padding_idx=None,
                        max_norm=None, norm_type=2., scale_grad_by_freq=False,
                        sparse=False):
        r"""Create Embedding instance from given 2-dimensional FloatTensor.

        Args:
            embeddings (Tensor): FloatTensor containing weights for the Embedding.
                First dimension is being passed to Embedding as ``num_embeddings``, second as ``embedding_dim``.
            freeze (bool, optional): If ``True``, the tensor does not get updated in the learning process.
                Equivalent to ``embedding.weight.requires_grad = False``. Default: ``True``
            padding_idx (int, optional): If specified, the entries at :attr:`padding_idx` do not contribute to the
                gradient; therefore, the embedding vector at :attr:`padding_idx` is not updated during training,
                i.e. it remains as a fixed "pad".
            max_norm (float, optional): See module initialization documentation.
            norm_type (float, optional): See module initialization documentation. Default ``2``.
            scale_grad_by_freq (bool, optional): See module initialization documentation. Default ``False``.
            sparse (bool, optional): See module initialization documentation.
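
        For illustration only (the tensor sizes here are arbitrary), passing
        ``freeze=False`` keeps the pretrained weights trainable::

            >>> weight = torch.randn(5, 4)
            >>> embedding = nn.Embedding.from_pretrained(weight, freeze=False)
            >>> embedding.weight.requires_grad
            True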

        Examples::

            >>> # FloatTensor containing pretrained weights
            >>> weight = torch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]])
            >>> embedding = nn.Embedding.from_pretrained(weight)
            >>> # Get embeddings for index 1
            >>> input = torch.LongTensor([1])
            >>> # xdoctest: +IGNORE_WANT("non-deterministic")
            >>> embedding(input)
            tensor([[ 4.0000,  5.1000,  6.3000]])
        """
        assert embeddings.dim() == 2, \
            'Embeddings parameter is expected to be 2-dimensional'
        rows, cols = embeddings.shape
        embedding = cls(
            num_embeddings=rows,
            embedding_dim=cols,
            _weight=embeddings,
            _freeze=freeze,
            padding_idx=padding_idx,
            max_norm=max_norm,
            norm_type=norm_type,
            scale_grad_by_freq=scale_grad_by_freq,
            sparse=sparse)
        return embedding