Paper: https://arxiv.org/abs/2012.07436
Code: https://github.com/zhouhaoyi/Informer2020
Reference: https://zhuanlan.zhihu.com/p/374936725
Each record in the dataset has 8 columns (the date plus 7 variables), and each time step's features are later projected to a 512-dimensional vector by a 1-D convolution. For the multivariate forecasting task the model predicts the last 7 columns; for the univariate task it predicts only the last column.
The date column is encoded into time features by utils/timeFeatures.py. Taking freq='h' as an example, the conversion produces a 4-dimensional vector whose components are [month, day, weekday, hour]:
| Before conversion | After conversion |
|---|---|
| 2016-07-01 00:00:00 | [7, 1, 4, 0] |
| 2016-07-01 00:15:00 | [7, 1, 4, 1] |
| … | … |
| 2017-06-25 23:00:00 | [6, 25, 6, 23] |
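The snippet below is a minimal sketch of this freq='h' conversion using pandas; it only illustrates the mapping in the table above and is not the exact code from timeFeatures.py.

```python
import pandas as pd

# Sketch of the freq='h' timestamp features: [month, day, weekday, hour].
dates = pd.to_datetime(["2016-07-01 00:00:00", "2017-06-25 23:00:00"])
df = pd.DataFrame({"date": dates})
df["month"] = df.date.dt.month        # 7, 6
df["day"] = df.date.dt.day            # 1, 25
df["weekday"] = df.date.dt.weekday    # 4 (Friday), 6 (Sunday)
df["hour"] = df.date.dt.hour          # 0, 23
stamp = df[["month", "day", "weekday", "hour"]].values   # shape: [N, 4]
print(stamp)
```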
As shown in the figure, the data embedding consists of three parts:
- Scalar: a conv1d that projects the scalar input into a 512-dimensional vector
- Local Time Stamp: the Positional Embedding used in Transformer
- Global Time Stamp: an embedding of the time-stamp features produced above
Finally, the three are summed to obtain the input (shape: [batch_size, seq_len, d_model]).
1. Token Embedding
A 1-D convolution over the raw input maps it from c_in = 7 dimensions to d_model = 512 dimensions.
```python
import math
import torch
import torch.nn as nn

class TokenEmbedding(nn.Module):
    def __init__(self, c_in, d_model):
        super(TokenEmbedding, self).__init__()
        padding = 1 if torch.__version__ >= '1.5.0' else 2
        self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model,
                                   kernel_size=3, padding=padding, padding_mode='circular')
        for m in self.modules():
            if isinstance(m, nn.Conv1d):
                nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='leaky_relu')

    def forward(self, x):
        # [B, L, c_in] -> convolve over the length dimension -> [B, L, d_model]
        x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2)
        return x
```
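A quick shape check (the batch size and sequence length below are arbitrary): the layer maps [batch_size, seq_len, c_in] to [batch_size, seq_len, d_model].

```python
# Hypothetical usage, just to verify shapes.
token_emb = TokenEmbedding(c_in=7, d_model=512)
x = torch.randn(32, 96, 7)     # [batch_size, seq_len, c_in]
print(token_emb(x).shape)      # torch.Size([32, 96, 512])
```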
2. Position Embedding
This is the same positional encoding as in Transformer; the formulas are:
$$PE_{(pos,2i)} = \sin\!\left(pos/10000^{2i/d_{model}}\right)$$
$$PE_{(pos,2i+1)} = \cos\!\left(pos/10000^{2i/d_{model}}\right)$$
```python
class PositionalEmbedding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEmbedding, self).__init__()
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model).float()
        pe.require_grad = False
        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return self.pe[:, :x.size(1)]
```
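A hypothetical call: the returned encoding depends only on the sequence length of x and is broadcast over the batch when it is added.

```python
pos_emb = PositionalEmbedding(d_model=512)
x = torch.randn(32, 96, 512)
print(pos_emb(x).shape)        # torch.Size([1, 96, 512])
```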
3. Timestamp Embedding
The timestamp is encoded in one of two ways, TemporalEmbedding or TimeFeatureEmbedding. The former processes the input timestamps with several embedding layers (month_embed, day_embed, weekday_embed, hour_embed and, optionally, minute_embed) and sums their outputs; the latter maps the input time features to a 512-dimensional embedding with a single fully connected layer.
The embeddings inside TemporalEmbedding can either use a fixed sinusoidal encoding like the Position Embedding above (embed_type=='fixed') or use nn.Embedding so that they are learned during training.
```python
class FixedEmbedding(nn.Module):
    def __init__(self, c_in, d_model):
        super(FixedEmbedding, self).__init__()
        w = torch.zeros(c_in, d_model).float()
        w.require_grad = False
        position = torch.arange(0, c_in).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()
        w[:, 0::2] = torch.sin(position * div_term)
        w[:, 1::2] = torch.cos(position * div_term)
        self.emb = nn.Embedding(c_in, d_model)
        self.emb.weight = nn.Parameter(w, requires_grad=False)

    def forward(self, x):
        return self.emb(x).detach()
```
```python
class TemporalEmbedding(nn.Module):
    def __init__(self, d_model, embed_type='fixed', freq='h'):
        super(TemporalEmbedding, self).__init__()
        minute_size = 4; hour_size = 24
        weekday_size = 7; day_size = 32; month_size = 13

        Embed = FixedEmbedding if embed_type == 'fixed' else nn.Embedding
        if freq == 't':
            self.minute_embed = Embed(minute_size, d_model)
        self.hour_embed = Embed(hour_size, d_model)
        self.weekday_embed = Embed(weekday_size, d_model)
        self.day_embed = Embed(day_size, d_model)
        self.month_embed = Embed(month_size, d_model)

    def forward(self, x):
        x = x.long()
        minute_x = self.minute_embed(x[:, :, 4]) if hasattr(self, 'minute_embed') else 0.
        hour_x = self.hour_embed(x[:, :, 3])
        weekday_x = self.weekday_embed(x[:, :, 2])
        day_x = self.day_embed(x[:, :, 1])
        month_x = self.month_embed(x[:, :, 0])
        return hour_x + weekday_x + day_x + month_x + minute_x
```
```python
class TimeFeatureEmbedding(nn.Module):
    def __init__(self, d_model, embed_type='timeF', freq='h'):
        super(TimeFeatureEmbedding, self).__init__()
        freq_map = {'h': 4, 't': 5, 's': 6, 'm': 1, 'a': 1, 'w': 2, 'd': 3, 'b': 3}
        d_inp = freq_map[freq]
        self.embed = nn.Linear(d_inp, d_model)

    def forward(self, x):
        return self.embed(x)
```
Finally, the three embeddings are summed to produce the final input:
```python
class DataEmbedding(nn.Module):
    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
        super(DataEmbedding, self).__init__()
        self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
        self.position_embedding = PositionalEmbedding(d_model=d_model)
        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type, freq=freq) \
            if embed_type != 'timeF' else TimeFeatureEmbedding(d_model=d_model, embed_type=embed_type, freq=freq)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, x_mark):
        x = self.value_embedding(x) + self.position_embedding(x) + self.temporal_embedding(x_mark)
        return self.dropout(x)
```
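A hypothetical end-to-end embedding call; with embed_type='fixed' and freq='h', x_mark carries the 4 integer time features [month, day, weekday, hour].

```python
emb = DataEmbedding(c_in=7, d_model=512, embed_type='fixed', freq='h')
x = torch.randn(32, 96, 7)                  # raw multivariate series
x_mark = torch.randint(0, 7, (32, 96, 4))   # random but in-range time-feature indices
print(emb(x, x_mark).shape)                 # torch.Size([32, 96, 512])
```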
Model
The overall model follows Transformer and is composed of an Encoder and a Decoder.
(Figure caption from the paper: Informer contains a 3-layer stack and a 1-layer stack (1/4 input) in the encoder.)
Attn
Two kinds of attention are used: the ordinary multi-head self-attention (FullAttention) and the ProbSparse self-attention newly proposed in Informer (ProbAttention).
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from math import sqrt
from utils.masking import TriangularCausalMask, ProbMask

class FullAttention(nn.Module):
    def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False):
        super(FullAttention, self).__init__()
        self.scale = scale
        self.mask_flag = mask_flag
        self.output_attention = output_attention
        self.dropout = nn.Dropout(attention_dropout)

    def forward(self, queries, keys, values, attn_mask):
        B, L, H, E = queries.shape
        _, S, _, D = values.shape
        scale = self.scale or 1. / sqrt(E)

        scores = torch.einsum("blhe,bshe->bhls", queries, keys)
        if self.mask_flag:
            if attn_mask is None:
                attn_mask = TriangularCausalMask(B, L, device=queries.device)
            scores.masked_fill_(attn_mask.mask, -np.inf)

        A = self.dropout(torch.softmax(scale * scores, dim=-1))
        V = torch.einsum("bhls,bshd->blhd", A, values)

        if self.output_attention:
            return (V.contiguous(), A)
        else:
            return (V.contiguous(), None)
```
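A hypothetical call (shapes only): FullAttention expects inputs that are already split into heads, i.e. [B, L, H, E].

```python
attn = FullAttention(mask_flag=False)
q = torch.randn(32, 96, 8, 64)
k = torch.randn(32, 96, 8, 64)
v = torch.randn(32, 96, 8, 64)
out, _ = attn(q, k, v, attn_mask=None)
print(out.shape)               # torch.Size([32, 96, 8, 64])
```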
The Informer model proposes a new attention layer, ProbSparse self-attention.
Q, K, V are the queries, keys and values obtained by multiplying the input embedding with weight matrices. ProbSparse self-attention first samples K to get K_sample and, for each $q_i \in Q$, computes a sparsity measurement $M$ against K_sample. It then selects the Top-u queries $q_i$ with the largest $M$ and computes attention scores for these Top-u queries against the full K:
$$A(Q,K,V) = \mathrm{Softmax}\!\left(\frac{\overline{Q}K^\top}{\sqrt{d}}\right)V$$
where $\overline{Q}$ is the matrix formed by the Top-u queries $q_i$. This gives $S_1$; the queries that were not selected use mean(V) as their score instead.
In the code, _get_initial_context() first sets every row of S to mean(V), and _update_context() then overwrites the rows belonging to the Top-u queries with
$$\mathrm{Softmax}\!\left(\frac{\overline{Q}K^\top}{\sqrt{d}}\right)V.$$
```python
class ProbAttention(nn.Module):
    def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False):
        super(ProbAttention, self).__init__()
        self.factor = factor
        self.scale = scale
        self.mask_flag = mask_flag
        self.output_attention = output_attention
        self.dropout = nn.Dropout(attention_dropout)

    def _prob_QK(self, Q, K, sample_k, n_top):  # n_top: c*ln(L_q)
        # Q [B, H, L, D]
        B, H, L_K, E = K.shape
        _, _, L_Q, _ = Q.shape

        # calculate the sampled Q_K
        K_expand = K.unsqueeze(-3).expand(B, H, L_Q, L_K, E)
        index_sample = torch.randint(L_K, (L_Q, sample_k))  # real U = U_part(factor*ln(L_k))*L_q
        K_sample = K_expand[:, :, torch.arange(L_Q).unsqueeze(1), index_sample, :]
        Q_K_sample = torch.matmul(Q.unsqueeze(-2), K_sample.transpose(-2, -1)).squeeze()

        # find the Top_k queries with the sparsity measurement
        M = Q_K_sample.max(-1)[0] - torch.div(Q_K_sample.sum(-1), L_K)
        M_top = M.topk(n_top, sorted=False)[1]

        # use the reduced Q to calculate Q_K
        Q_reduce = Q[torch.arange(B)[:, None, None],
                     torch.arange(H)[None, :, None],
                     M_top, :]  # factor*ln(L_q)
        Q_K = torch.matmul(Q_reduce, K.transpose(-2, -1))  # factor*ln(L_q)*L_k

        return Q_K, M_top

    def _get_initial_context(self, V, L_Q):
        B, H, L_V, D = V.shape
        if not self.mask_flag:
            # V_sum = V.sum(dim=-2)
            V_sum = V.mean(dim=-2)
            contex = V_sum.unsqueeze(-2).expand(B, H, L_Q, V_sum.shape[-1]).clone()
        else:  # use mask
            assert (L_Q == L_V)  # requires that L_Q == L_V, i.e. for self-attention only
            contex = V.cumsum(dim=-2)
        return contex

    def _update_context(self, context_in, V, scores, index, L_Q, attn_mask):
        B, H, L_V, D = V.shape

        if self.mask_flag:
            attn_mask = ProbMask(B, H, L_Q, index, scores, device=V.device)
            scores.masked_fill_(attn_mask.mask, -np.inf)

        attn = torch.softmax(scores, dim=-1)  # nn.Softmax(dim=-1)(scores)

        context_in[torch.arange(B)[:, None, None],
                   torch.arange(H)[None, :, None],
                   index, :] = torch.matmul(attn, V).type_as(context_in)
        if self.output_attention:
            attns = (torch.ones([B, H, L_V, L_V]) / L_V).type_as(attn).to(attn.device)
            attns[torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :] = attn
            return (context_in, attns)
        else:
            return (context_in, None)

    def forward(self, queries, keys, values, attn_mask):
        B, L_Q, H, D = queries.shape
        _, L_K, _, _ = keys.shape

        queries = queries.transpose(2, 1)
        keys = keys.transpose(2, 1)
        values = values.transpose(2, 1)

        U_part = self.factor * np.ceil(np.log(L_K)).astype('int').item()  # c*ln(L_k)
        u = self.factor * np.ceil(np.log(L_Q)).astype('int').item()       # c*ln(L_q)

        U_part = U_part if U_part < L_K else L_K
        u = u if u < L_Q else L_Q

        scores_top, index = self._prob_QK(queries, keys, sample_k=U_part, n_top=u)
        # add scale factor
        scale = self.scale or 1. / sqrt(D)
        if scale is not None:
            scores_top = scores_top * scale
        # get the context
        context = self._get_initial_context(values, L_Q)
        # update the context with the selected top_k queries
        context, attn = self._update_context(context, values, scores_top, index, L_Q, attn_mask)

        return context.contiguous(), attn
```
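A hypothetical call to illustrate the sparsity: with L_Q = L_K = 96 and factor = 5, only u = 5·⌈ln 96⌉ = 25 queries per head are scored exactly, while the remaining queries fall back to mean(V). Note that, as written, this forward returns the context as [B, H, L_Q, D] (head dimension first).

```python
prob_attn = ProbAttention(mask_flag=False, factor=5)
q = torch.randn(32, 96, 8, 64)
k = torch.randn(32, 96, 8, 64)
v = torch.randn(32, 96, 8, 64)
out, _ = prob_attn(q, k, v, attn_mask=None)
print(out.shape)               # torch.Size([32, 8, 96, 64])
```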
AttentionLayer is the attention wrapper layer. It first maps the input embeddings to query, key and value with linear projections and splits the $d_{model}$ dimension into multiple heads, then runs the attention operation defined above, and finally applies a linear projection to produce the output.
```python
class AttentionLayer(nn.Module):
    def __init__(self, attention, d_model, n_heads, d_keys=None, d_values=None):
        super(AttentionLayer, self).__init__()

        d_keys = d_keys or (d_model // n_heads)
        d_values = d_values or (d_model // n_heads)

        self.inner_attention = attention
        self.query_projection = nn.Linear(d_model, d_keys * n_heads)
        self.key_projection = nn.Linear(d_model, d_keys * n_heads)
        self.value_projection = nn.Linear(d_model, d_values * n_heads)
        self.out_projection = nn.Linear(d_values * n_heads, d_model)
        self.n_heads = n_heads

    def forward(self, queries, keys, values, attn_mask):
        B, L, _ = queries.shape
        _, S, _ = keys.shape
        H = self.n_heads

        queries = self.query_projection(queries).view(B, L, H, -1)
        keys = self.key_projection(keys).view(B, S, H, -1)
        values = self.value_projection(values).view(B, S, H, -1)

        out, attn = self.inner_attention(
            queries,
            keys,
            values,
            attn_mask
        )
        out = out.view(B, L, -1)

        return self.out_projection(out), attn
```
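A hypothetical usage wrapping FullAttention with 8 heads; d_model = 512 is split into 8 heads of 64 dimensions each.

```python
layer = AttentionLayer(FullAttention(mask_flag=False), d_model=512, n_heads=8)
x = torch.randn(32, 96, 512)
out, _ = layer(x, x, x, attn_mask=None)
print(out.shape)               # torch.Size([32, 96, 512])
```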
4. ConvLayer
The ConvLayer class implements the distilling operation in Informer, which is essentially a 1-D convolution + ELU activation + max pooling. The formula (from the paper) is:
$$X^t_{j+1} = \mathrm{MaxPool}\big(\mathrm{ELU}(\mathrm{Conv1d}([X^t_j]_{AB}))\big)$$
This operation halves each seq_len-long sequence along its length dimension, i.e. the distilling step.
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class ConvLayer(nn.Module):
    def __init__(self, c_in):
        super(ConvLayer, self).__init__()
        # version-dependent padding (as in TokenEmbedding) so that the
        # subsequent max pooling halves the length exactly
        padding = 1 if torch.__version__ >= '1.5.0' else 2
        self.downConv = nn.Conv1d(in_channels=c_in,
                                  out_channels=c_in,
                                  kernel_size=3,
                                  padding=padding,
                                  padding_mode='circular')
        self.norm = nn.BatchNorm1d(c_in)
        self.activation = nn.ELU()
        self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)

    def forward(self, x):
        # [B, L, D] -> [B, D, L] for the 1-D convolution, then back to [B, L/2, D]
        x = self.downConv(x.permute(0, 2, 1))
        x = self.norm(x)
        x = self.activation(x)
        x = self.maxPool(x)
        x = x.transpose(1, 2)
        return x
```
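A hypothetical shape check of the distilling layer: with the version-dependent padding above it halves the length dimension exactly.

```python
conv = ConvLayer(c_in=512)
x = torch.randn(32, 96, 512)   # [B, L, d_model]
print(conv(x).shape)           # torch.Size([32, 48, 512])
```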
EncoderLayer
The EncoderLayer class implements a single Encoder layer. The overall architecture is largely the same as in Transformer and contains two sub-layers: a multi-head attention layer (in Informer this is the proposed ProbSparse self-attention) and a feed-forward layer made of two linear mappings. Each sub-layer is followed by layer normalization, and the sub-layers are connected with residual (skip) connections.
```python
class EncoderLayer(nn.Module):
    def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"):
        super(EncoderLayer, self).__init__()
        d_ff = d_ff or 4 * d_model
        self.attention = attention
        # conv1/conv2 with kernel_size=1 act like nn.Linear() applied per time step
        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = F.relu if activation == "relu" else F.gelu

    def forward(self, x, attn_mask=None):
        # x [B, L, D]
        # x = x + self.dropout(self.attention(
        #     x, x, x,
        #     attn_mask = attn_mask
        # ))
        new_x, attn = self.attention(
            x, x, x,
            attn_mask=attn_mask
        )
        x = x + self.dropout(new_x)

        y = x = self.norm1(x)
        y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
        y = self.dropout(self.conv2(y).transpose(-1, 1))

        return self.norm2(x + y), attn
```
The Encoder class assembles the EncoderLayers and the distilling operation defined above into one Encoder module. There is always one fewer distilling layer than EncoderLayer, i.e. no distilling is applied after the last EncoderLayer.
```python
class Encoder(nn.Module):
    def __init__(self, attn_layers, conv_layers=None, norm_layer=None):
        super(Encoder, self).__init__()
        self.attn_layers = nn.ModuleList(attn_layers)
        self.conv_layers = nn.ModuleList(conv_layers) if conv_layers is not None else None
        self.norm = norm_layer

    def forward(self, x, attn_mask=None):
        # x [B, L, D]
        attns = []
        if self.conv_layers is not None:
            for attn_layer, conv_layer in zip(self.attn_layers, self.conv_layers):
                x, attn = attn_layer(x, attn_mask=attn_mask)
                x = conv_layer(x)
                attns.append(attn)
            x, attn = self.attn_layers[-1](x)
            attns.append(attn)
        else:
            for attn_layer in self.attn_layers:
                x, attn = attn_layer(x, attn_mask=attn_mask)
                attns.append(attn)

        if self.norm is not None:
            x = self.norm(x)

        return x, attns
```
To make the distilling operation more robust, the paper builds several replicas that run in parallel. Each replica takes an input of a different length (L, L/2, L/4, …); halving the input length removes one attention layer, and the number of distilling layers shrinks accordingly. The outputs of all replicas are concatenated as the final output.
```python
class EncoderStack(nn.Module):
    def __init__(self, encoders, inp_lens):
        super(EncoderStack, self).__init__()
        self.encoders = nn.ModuleList(encoders)
        self.inp_lens = inp_lens

    def forward(self, x, attn_mask=None):
        # x [B, L, D]
        x_stack = []; attns = []
        for i_len, encoder in zip(self.inp_lens, self.encoders):
            inp_len = x.shape[1] // (2 ** i_len)
            x_s, attn = encoder(x[:, -inp_len:, :])
            x_stack.append(x_s); attns.append(attn)
        x_stack = torch.cat(x_stack, -2)

        return x_stack, attns
```
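A small worked example of the length bookkeeping (assuming each distilling layer halves the length exactly): with e_layers = [3, 2, 1] and L = 96, every replica ends up with length 24 and the concatenated output has length 72 = 3L/4.

```python
L = 96
e_layers = [3, 2, 1]
out_lens = []
for i, el in enumerate(e_layers):
    inp_len = L // (2 ** i)                      # replica i sees the last L / 2^i steps
    out_lens.append(inp_len // (2 ** (el - 1)))  # el - 1 distilling layers halve it further
print(out_lens, sum(out_lens))                   # [24, 24, 24] 72
```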
Decoder
The Decoder follows the Decoder structure in Transformer: two attention layers plus a feed-forward part made of two linear mappings.
Note that in the first attention layer the query, key and value all come from the Decoder's input embedding multiplied by weight matrices, whereas in the second attention layer the query comes from the output of the previous attention layer and the key and value come from the Encoder's output, each multiplied by their weight matrices.
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class DecoderLayer(nn.Module):
    def __init__(self, self_attention, cross_attention, d_model, d_ff=None,
                 dropout=0.1, activation="relu"):
        super(DecoderLayer, self).__init__()
        d_ff = d_ff or 4 * d_model
        self.self_attention = self_attention
        self.cross_attention = cross_attention
        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = F.relu if activation == "relu" else F.gelu

    def forward(self, x, cross, x_mask=None, cross_mask=None):
        x = x + self.dropout(self.self_attention(
            x, x, x,
            attn_mask=x_mask
        )[0])
        x = self.norm1(x)

        x = x + self.dropout(self.cross_attention(
            x, cross, cross,
            attn_mask=cross_mask
        )[0])

        y = x = self.norm2(x)
        y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
        y = self.dropout(self.conv2(y).transpose(-1, 1))

        return self.norm3(x + y)

class Decoder(nn.Module):
    def __init__(self, layers, norm_layer=None):
        super(Decoder, self).__init__()
        self.layers = nn.ModuleList(layers)
        self.norm = norm_layer

    def forward(self, x, cross, x_mask=None, cross_mask=None):
        for layer in self.layers:
            x = layer(x, cross, x_mask=x_mask, cross_mask=cross_mask)

        if self.norm is not None:
            x = self.norm(x)

        return x
```
Model.py
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

from utils.masking import TriangularCausalMask, ProbMask
from models.encoder import Encoder, EncoderLayer, ConvLayer, EncoderStack
from models.decoder import Decoder, DecoderLayer
from models.attn import FullAttention, ProbAttention, AttentionLayer
from models.embed import DataEmbedding

class Informer(nn.Module):
    def __init__(self, enc_in, dec_in, c_out, seq_len, label_len, out_len,
                 factor=5, d_model=512, n_heads=8, e_layers=3, d_layers=2, d_ff=512,
                 dropout=0.0, attn='prob', embed='fixed', freq='h', activation='gelu',
                 output_attention=False, distil=True,
                 device=torch.device('cuda:0')):
        super(Informer, self).__init__()
        self.pred_len = out_len
        self.attn = attn
        self.output_attention = output_attention

        # Encoding
        self.enc_embedding = DataEmbedding(enc_in, d_model, embed, freq, dropout)
        self.dec_embedding = DataEmbedding(dec_in, d_model, embed, freq, dropout)
        # Attention
        Attn = ProbAttention if attn == 'prob' else FullAttention
        # Encoder
        self.encoder = Encoder(
            [
                EncoderLayer(
                    AttentionLayer(Attn(False, factor, attention_dropout=dropout, output_attention=output_attention),
                                   d_model, n_heads),
                    d_model,
                    d_ff,
                    dropout=dropout,
                    activation=activation
                ) for l in range(e_layers)
            ],
            [
                ConvLayer(
                    d_model
                ) for l in range(e_layers - 1)
            ] if distil else None,
            norm_layer=torch.nn.LayerNorm(d_model)
        )
        # Decoder
        self.decoder = Decoder(
            [
                DecoderLayer(
                    AttentionLayer(Attn(True, factor, attention_dropout=dropout, output_attention=False),
                                   d_model, n_heads),
                    AttentionLayer(FullAttention(False, factor, attention_dropout=dropout, output_attention=False),
                                   d_model, n_heads),
                    d_model,
                    d_ff,
                    dropout=dropout,
                    activation=activation,
                )
                for l in range(d_layers)
            ],
            norm_layer=torch.nn.LayerNorm(d_model)
        )
        # self.end_conv1 = nn.Conv1d(in_channels=label_len+out_len, out_channels=out_len, kernel_size=1, bias=True)
        # self.end_conv2 = nn.Conv1d(in_channels=d_model, out_channels=c_out, kernel_size=1, bias=True)
        self.projection = nn.Linear(d_model, c_out, bias=True)

    def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec,
                enc_self_mask=None, dec_self_mask=None, dec_enc_mask=None):
        enc_out = self.enc_embedding(x_enc, x_mark_enc)
        enc_out, attns = self.encoder(enc_out, attn_mask=enc_self_mask)

        dec_out = self.dec_embedding(x_dec, x_mark_dec)
        dec_out = self.decoder(dec_out, enc_out, x_mask=dec_self_mask, cross_mask=dec_enc_mask)
        dec_out = self.projection(dec_out)

        # dec_out = self.end_conv1(dec_out)
        # dec_out = self.end_conv2(dec_out.transpose(2,1)).transpose(1,2)
        if self.output_attention:
            return dec_out[:, -self.pred_len:, :], attns
        else:
            return dec_out[:, -self.pred_len:, :]  # [B, L, D]
```
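A hypothetical smoke test with dummy tensors (an ETT-style setup is assumed: 7 variables, seq_len = 96, label_len = 48, pred_len = 24, freq = 'h'); the decoder input concatenates the label segment with a placeholder for the prediction segment.

```python
model = Informer(enc_in=7, dec_in=7, c_out=7,
                 seq_len=96, label_len=48, out_len=24,
                 attn='prob', embed='fixed', freq='h',
                 device=torch.device('cpu'))
x_enc = torch.randn(2, 96, 7)
x_mark_enc = torch.randint(0, 7, (2, 96, 4))      # [month, day, weekday, hour] indices
x_dec = torch.randn(2, 48 + 24, 7)                # label_len + pred_len steps (future part is a placeholder)
x_mark_dec = torch.randint(0, 7, (2, 48 + 24, 4))
out = model(x_enc, x_mark_enc, x_dec, x_mark_dec)
print(out.shape)                                  # torch.Size([2, 24, 7])
```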
InformerStack is the variant that uses multiple encoder replicas (an EncoderStack) in the Encoder part.
```python
class InformerStack(nn.Module):
    def __init__(self, enc_in, dec_in, c_out, seq_len, label_len, out_len,
                 factor=5, d_model=512, n_heads=8, e_layers=[3, 2, 1], d_layers=2, d_ff=512,
                 dropout=0.0, attn='prob', embed='fixed', freq='h', activation='gelu',
                 output_attention=False, distil=True,
                 device=torch.device('cuda:0')):
        super(InformerStack, self).__init__()
        self.pred_len = out_len
        self.attn = attn
        self.output_attention = output_attention

        # Encoding
        self.enc_embedding = DataEmbedding(enc_in, d_model, embed, freq, dropout)
        self.dec_embedding = DataEmbedding(dec_in, d_model, embed, freq, dropout)
        # Attention
        Attn = ProbAttention if attn == 'prob' else FullAttention
        # Encoder
        inp_lens = list(range(len(e_layers)))  # [0,1,2,...] you can customize here
        encoders = [
            Encoder(
                [
                    EncoderLayer(
                        AttentionLayer(Attn(False, factor, attention_dropout=dropout, output_attention=output_attention),
                                       d_model, n_heads),
                        d_model,
                        d_ff,
                        dropout=dropout,
                        activation=activation
                    ) for l in range(el)
                ],
                [
                    ConvLayer(
                        d_model
                    ) for l in range(el - 1)
                ] if distil else None,
                norm_layer=torch.nn.LayerNorm(d_model)
            ) for el in e_layers]
        self.encoder = EncoderStack(encoders, inp_lens)
        # Decoder
        self.decoder = Decoder(
            [
                DecoderLayer(
                    AttentionLayer(Attn(True, factor, attention_dropout=dropout, output_attention=False),
                                   d_model, n_heads),
                    AttentionLayer(FullAttention(False, factor, attention_dropout=dropout, output_attention=False),
                                   d_model, n_heads),
                    d_model,
                    d_ff,
                    dropout=dropout,
                    activation=activation,
                )
                for l in range(d_layers)
            ],
            norm_layer=torch.nn.LayerNorm(d_model)
        )
        # self.end_conv1 = nn.Conv1d(in_channels=label_len+out_len, out_channels=out_len, kernel_size=1, bias=True)
        # self.end_conv2 = nn.Conv1d(in_channels=d_model, out_channels=c_out, kernel_size=1, bias=True)
        self.projection = nn.Linear(d_model, c_out, bias=True)

    def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec,
                enc_self_mask=None, dec_self_mask=None, dec_enc_mask=None):
        enc_out = self.enc_embedding(x_enc, x_mark_enc)
        enc_out, attns = self.encoder(enc_out, attn_mask=enc_self_mask)

        dec_out = self.dec_embedding(x_dec, x_mark_dec)
        dec_out = self.decoder(dec_out, enc_out, x_mask=dec_self_mask, cross_mask=dec_enc_mask)
        dec_out = self.projection(dec_out)

        # dec_out = self.end_conv1(dec_out)
        # dec_out = self.end_conv2(dec_out.transpose(2,1)).transpose(1,2)
        if self.output_attention:
            return dec_out[:, -self.pred_len:, :], attns
        else:
            return dec_out[:, -self.pred_len:, :]  # [B, L, D]
```
7. masking.py
The mask mechanism is used in the first attention layer of the Decoder to guarantee that the output decoded at time t only depends on outputs before time t.
The generated mask matrix has 1s in its upper-right triangle (excluding the diagonal). Applying it to the score matrix sets those positions to $-\infty$, so they become 0 after the softmax.
TriangularCausalMask is used in the FullAttention layer, and ProbMask in the ProbSparse attention layer.
```python
import torch

class TriangularCausalMask():
    def __init__(self, B, L, device="cpu"):
        mask_shape = [B, 1, L, L]
        with torch.no_grad():
            self._mask = torch.triu(torch.ones(mask_shape, dtype=torch.bool), diagonal=1).to(device)

    @property
    def mask(self):
        return self._mask

class ProbMask():
    def __init__(self, B, H, L, index, scores, device="cpu"):
        _mask = torch.ones(L, scores.shape[-1], dtype=torch.bool).to(device).triu(1)
        _mask_ex = _mask[None, None, :].expand(B, H, L, scores.shape[-1])
        indicator = _mask_ex[torch.arange(B)[:, None, None],
                             torch.arange(H)[None, :, None],
                             index, :].to(device)
        self._mask = indicator.view(scores.shape).to(device)

    @property
    def mask(self):
        return self._mask
```
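A hypothetical peek at the causal mask: True (printed as 1) marks the positions that are filled with -inf before the softmax.

```python
m = TriangularCausalMask(B=1, L=4)
print(m.mask[0, 0].int())
# tensor([[0, 1, 1, 1],
#         [0, 0, 1, 1],
#         [0, 0, 0, 1],
#         [0, 0, 0, 0]], dtype=torch.int32)
```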