• Ataloglou，D.，Dimou，A.，Zarpalas，D.，Daras，P.Neuroinform（2019）。 它提供了在MRI数据集上评估所提出的方法所必需的代码和经过训练的模型，该数据集是研究中使用的两个公共数据集，还是新的。 此处提供的...
• B_CNN ** 再经过sum-pool 或者maxpool，后加分类模块。 fA：CxM fB: CxN bilinear = MxN -----> reshape to size MN x 1 -----> classification 双线性CNN的梯度传播 实验 本文采用 M（15 layers）conv5+...
**
B_CNN
**

再经过sum-pool 或者maxpool，后加分类模块。
fA：CxM    fB: CxN
bilinear = MxN  -----> reshape to size MN x 1 ----->  classification
双线性CNN的梯度传播

实验
本文采用 M（15 layers）conv5+relu ，D（30 layers）conv5-4 + relu。

Low dimensional bilinear CNN models
对称初始化的b-cnn在参数调整后还是对称的，因为梯度是相同的。虽然有效率，但是次优的，没有探索到不同的CNN空间。
为打破特征提取器的对称性：

dropout效果稍微有所下降
降维度，降低通道数。


展开全文
• <p>also noted that we are using <code>conv1d</code> but in reality we should be using <code>conv2d</code> <a href="https://github.com/castorini/Castor/blob/master/sm_cnn/model.py#L39">here</a> since ...
• ## 神经网络 BP_CNN_RNN

千次阅读 2018-12-20 18:30:10
import tensorflow as tf from tensorflow.examples.tutorials.mnist import input_data import numpy as np import os  os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  mnist = input_data.read_data_...MNIST_d...

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import numpy as np

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

def add_layer(inputs, in_size, out_size, activation_function=None):
    """Build one fully connected layer and return its output tensor.

    NOTE(review): the ``def`` line was missing from this listing (the body
    ended in a top-level ``return``). The signature is reconstructed from the
    names the body uses -- confirm it against the call sites.

    Args:
        inputs: 2-D tensor fed into the layer.
        in_size: number of columns of ``inputs``.
        out_size: number of units produced by the layer.
        activation_function: optional callable applied to ``inputs @ W + b``;
            when ``None`` the affine output is returned unchanged.
    """
    Weights = tf.Variable(tf.random.normal([in_size, out_size]))
    # Biases start slightly positive (0.1) rather than at zero.
    biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)
    Wx_plus_b = tf.matmul(inputs, Weights) + biases
    if activation_function is None:
        return Wx_plus_b
    return activation_function(Wx_plus_b)

# Network inputs: flattened 28*28 images and 10-column label rows
# (presumably one-hot, since accuracy is computed with argmax -- confirm).
xs = tf.placeholder(tf.float32, [None, 784])
ys = tf.placeholder(tf.float32, [None, 10])

# Cross-entropy loss. The prediction is clipped away from 0 before the log so
# that a saturated output cannot produce log(0) = -inf and poison the loss
# with NaN.
cross_entropy = tf.reduce_mean(
    -tf.reduce_sum(ys * tf.log(tf.clip_by_value(prediction, 1e-10, 1.0)),
                   reduction_indices=[1]))

init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

def computer_accuracy(v_xs, v_ys):
    """Return the fraction of rows whose predicted class matches the label.

    Runs ``prediction`` on ``v_xs`` in the module-level session and compares
    the arg-max of every output row with the arg-max of the matching row of
    ``v_ys``.

    The comparison is done in NumPy on the fetched array. Building
    ``tf.equal`` / ``tf.reduce_mean`` ops here would add new nodes to the
    graph on every call, so the graph would keep growing during training.

    Args:
        v_xs: batch of inputs to feed into ``xs``.
        v_ys: array of label rows, one per input row.

    Returns:
        The accuracy as a ``numpy.float32`` scalar.
    """
    y_pre = sess.run(prediction, feed_dict={xs: v_xs})
    hits = np.argmax(y_pre, 1) == np.argmax(v_ys, 1)
    return np.mean(hits, dtype=np.float32)

# Run 1000 training steps on mini-batches of 100 samples and print the
# test-set accuracy on every 50th step (including step 0).
for step in range(1000):
    images, labels = mnist.train.next_batch(100)
    sess.run(train_step, feed_dict={xs: images, ys: labels})
    if step % 50:
        continue
    print(computer_accuracy(mnist.test.images, mnist.test.labels))

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# number 1 to 10 data

######################### graph size ##########################################
graphw=28
graphh=28
classes=10
######################### graph size ##########################################

def compute_accuracy(v_xs, v_ys):
    """Return the fraction of rows whose predicted class matches the label.

    ``prediction`` is evaluated with ``keep_prob`` fed as 1 so that dropout is
    disabled during evaluation. The arg-max comparison is done in NumPy:
    creating ``tf.equal`` / ``tf.reduce_mean`` ops inside this function would
    add new nodes to the graph on every call.

    Args:
        v_xs: batch of inputs to feed into ``xs``.
        v_ys: array of label rows, one per input row.

    Returns:
        The accuracy as a ``numpy.float32`` scalar.
    """
    import numpy as np  # local import: this script's header does not import numpy

    y_pre = sess.run(prediction, feed_dict={xs: v_xs, keep_prob: 1})
    hits = np.argmax(y_pre, 1) == np.argmax(v_ys, 1)
    return np.mean(hits, dtype=np.float32)

def weight_variable(shape):
    """Create a trainable weight tensor of ``shape`` with random initial values.

    Initial values are drawn from a truncated normal distribution with a
    standard deviation of 0.1.
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
    """Create a trainable bias tensor of ``shape``.

    Every element starts at the constant 0.1 (the initial value is not random).
    """
    return tf.Variable(tf.constant(0.1, shape=shape))

def conv2d(x, W):
    """Convolve ``x`` with the kernel ``W`` using stride 1 and SAME padding."""
    # strides layout is [1, x_movement, y_movement, 1]; the first and last
    # entries must be 1.
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')

def max_pool_2x2(x):
    """Down-sample ``x`` with 2x2 max pooling, halving its height and width.

    The body of this function was missing from the listing. A 2x2 window with
    stride 2 and SAME padding is what the rest of the script requires: two
    such poolings turn the 28x28 input into the 7x7 maps assumed by
    ``bp_in_size = 7*7*16``.
    """
    # ksize / strides layout is [1, x_movement, y_movement, 1].
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

def convlayer(x_image, patch_size, conv_in, conv_out):
    """Build one convolution stage: convolution, bias, ReLU, then max pooling.

    Args:
        x_image: input feature maps.
        patch_size: side length of the square convolution kernel.
        conv_in: number of input channels.
        conv_out: number of kernels, i.e. output channels.

    Returns:
        The pooled feature maps.
    """
    kernel = weight_variable([patch_size, patch_size, conv_in, conv_out])
    offset = bias_variable([conv_out])
    activated = tf.nn.relu(conv2d(x_image, kernel) + offset)
    return max_pool_2x2(activated)

def add_layer(inputs, in_size, out_size, layer_name, activation_function=None, ):
    """Build a fully connected layer with dropout and an optional activation.

    Dropout (driven by the module-level ``keep_prob`` placeholder) is applied
    to the affine output before the activation function.

    Args:
        inputs: 2-D tensor fed into the layer.
        in_size: number of columns of ``inputs``.
        out_size: number of units produced by the layer.
        layer_name: accepted but not used by this function.
        activation_function: optional callable applied after dropout; when
            ``None`` the dropped-out affine output is returned.
    """
    weights = weight_variable([in_size, out_size])
    bias = bias_variable([out_size])
    dropped = tf.nn.dropout(tf.matmul(inputs, weights) + bias, keep_prob)
    if activation_function is not None:
        return activation_function(dropped)
    return dropped

# define placeholder for inputs to network:
# xs holds flattened graphw*graphh images, ys holds one row of `classes`
# values per image, keep_prob is the dropout keep probability fed at run time.
xs = tf.placeholder(tf.float32, [None, graphw*graphh])
ys = tf.placeholder(tf.float32, [None, classes])
keep_prob = tf.placeholder(tf.float32)

# Restore the 2-D image layout: [batch, height, width, 1 channel].
x_image = tf.reshape(xs, [-1, graphh, graphw, 1])
######################### conv1 layer ##########################################
patch_size=5   # convolution kernel size
conv1_in=1     # input channels, default=1
conv1_out=8    # 8 output channels                # os1=graphh*graphw*8
h_pool1=convlayer(x_image,patch_size,conv1_in,conv1_out) # os2=os1/4
######################### conv1 layer ##########################################

######################### conv2 layer ##########################################
patch_size=5
conv2_in=8
conv2_out=16 # twice conv1_out                    # os3=os2*2
h_pool2=convlayer(h_pool1,patch_size,conv2_in,conv2_out) # os4=os3/4
######################### conv2 layer ##########################################

######################### func1 layer ##########################################
# Flattened size of h_pool2: presumably 28/2/2 = 7 per side after the two
# poolings, times conv2_out channels -- TODO confirm.
bp_in_size=7*7*16
bp_out_size=128
bp_in=tf.reshape(h_pool2, [-1,bp_in_size]) # recompute bp_in_size for your own dataset
# NOTE(review): no add_layer(...) call appears in this section, so the hidden
# layer output is never built here -- presumably a line was lost; confirm
# against the original script.
######################### func1 layer ##########################################

######################### func2 layer ##########################################
bp_in_size=bp_out_size
bp_out_size=classes
# NOTE(review): `prediction` and `train_step` are used further down but are not
# defined anywhere in this listing -- presumably their definitions were lost.
######################### func2 layer ##########################################

# Cross-entropy loss. The prediction is clipped away from 0 before the log so
# that a saturated output cannot produce log(0) = -inf and turn the loss
# into NaN.
cross_entropy = tf.reduce_mean(
    -tf.reduce_sum(ys * tf.log(tf.clip_by_value(prediction, 1e-10, 1.0)),
                   reduction_indices=[1]))

sess = tf.Session()
sess.run(tf.global_variables_initializer())

# 1000 training steps on mini-batches of 100 with dropout keep probability 0.5;
# the test-set accuracy is printed on every 50th step.
for i in range(1000):
    batch_xs, batch_ys = mnist.train.next_batch(100)
    sess.run(train_step, feed_dict={xs: batch_xs, ys: batch_ys, keep_prob: 0.5})
    if i % 50 == 0:
        print(compute_accuracy(mnist.test.images, mnist.test.labels))

#coding:utf-8
import  tensorflow as tf
from  tensorflow.examples.tutorials.mnist import  input_data
from tensorflow.contrib import  rnn

######################### hyperparameters ######################################
lr=0.001                # learning rate
training_iters=50000    # upper bound on step*batch_size in the training loop
batch_size=100
test_batch=100

n_inputs=28             # size of each element x of the sequence
n_steps=28              # sequence length
n_hidden_unis=128
n_classes=10            # number of target classes
######################### hyperparameters ######################################

# define the inputs and the expected outputs
x=tf.placeholder(dtype=tf.float32,shape=[None,n_steps,n_inputs],name="inputx")
y=tf.placeholder(dtype=tf.float32,shape=[None,n_classes],name="expected_y")
keep_prob = tf.placeholder(tf.float32)

# define the weights: 'in' projects each input element to the hidden size,
# 'out' projects the hidden size to the class scores
weights={
'in':tf.Variable(tf.truncated_normal([n_inputs,n_hidden_unis])),
'out':tf.Variable(tf.truncated_normal([n_hidden_unis,n_classes]))
}
# biases matching the two projections above, all initialised to 0.1
biases={
'in':tf.Variable(tf.constant(0.1,shape=[n_hidden_unis,])),
'out':tf.Variable(tf.constant(0.1,shape=[n_classes,]))
}

def compute_accuracy(v_xs, v_ys):
    """Return the fraction of sequences whose predicted class matches the label.

    ``v_xs`` is reshaped to ``[-1, n_steps, n_inputs]`` before being fed to
    ``x``. The arg-max comparison is done in NumPy: creating ``tf.equal`` /
    ``tf.reduce_mean`` ops inside this function would add new nodes to the
    graph on every call.

    Args:
        v_xs: NumPy array of inputs; its size must be a multiple of
            ``n_steps * n_inputs``.
        v_ys: array of label rows, one per input sequence.

    Returns:
        The accuracy as a ``numpy.float32`` scalar.
    """
    import numpy as np  # local import: this script's header does not import numpy

    v_xs = v_xs.reshape([-1, n_steps, n_inputs])
    y_pre = sess.run(pred, feed_dict={x: v_xs, keep_prob: 1})
    hits = np.argmax(y_pre, 1) == np.argmax(v_ys, 1)
    return np.mean(hits, dtype=np.float32)

def RNN(X, weights, biases):
    """Build the LSTM classifier graph and return the class scores (logits).

    Args:
        X: tensor that can be reshaped to ``[-1, n_inputs]``; the script feeds
            ``[batch, n_steps, n_inputs]``.
        weights: dict with the ``'in'`` and ``'out'`` projection matrices.
        biases: dict with the ``'in'`` and ``'out'`` bias vectors.

    Returns:
        The ``'out'`` projection of ``states[1]`` returned by ``dynamic_rnn``.
    """
    # Input projection, applied to every time step at once:
    # (batch, n_steps, n_inputs) -> (batch * n_steps, n_inputs)
    X = tf.reshape(X, [-1, n_inputs])
    # -> (batch * n_steps, n_hidden_unis)
    X_in = tf.matmul(X, weights['in']) + biases['in']
    # -> (batch, n_steps, n_hidden_unis)
    X_in = tf.reshape(X_in, [-1, n_steps, n_hidden_unis])

    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden_unis, forget_bias=1.0, state_is_tuple=True)
    # Pass dtype instead of initial_state=zero_state(batch_size): dynamic_rnn
    # then creates a zero initial state sized to the batch actually fed, so the
    # graph is no longer tied to one fixed batch size.
    _, states = tf.nn.dynamic_rnn(lstm_cell, X_in, dtype=tf.float32, time_major=False)

    # With state_is_tuple=True the state is a pair; the second entry is
    # projected to the class scores.
    return tf.matmul(states[1], weights['out']) + biases['out']

pred = RNN(x, weights, biases)

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))

sess = tf.Session()
# tf.initialize_all_variables() is deprecated; use the same initializer as the
# other scripts in this file.
sess.run(tf.global_variables_initializer())

# Train until step * batch_size reaches training_iters, printing the accuracy
# on one fixed test batch every 50 steps.
# NOTE(review): `train_step` is not defined anywhere in this listing --
# presumably the optimizer line was lost; confirm against the original script.
step = 0
testx, testy = mnist.test.next_batch(test_batch)
while step * batch_size < training_iters:
    batch_xs, batch_ys = mnist.train.next_batch(batch_size)
    batch_xs = batch_xs.reshape([batch_size, n_steps, n_inputs])
    sess.run(train_step, feed_dict={x: batch_xs, y: batch_ys})
    if step % 50 == 0:
        print(compute_accuracy(testx, testy))
    step += 1


展开全文
• ve successfully trained BidLSTM_CNN model and obtained the <code>model_weights.hdf5</code> file. <p>When I run: <code>python nerTagger.py --dataset-type conll2003 --architecture BidLSTM_CNN eval...
• The outfielder[ˈaʊtfi:ldə(r)]外场手 immediately starts running, anticipating the ball’s trajectory[trəˈdʒektəri]轨线. He tracks it, adapts his movements, and finally catches it (under a ...
     The batter[ˈbætər]击球员 hits the ball. The outfielder[ˈaʊtfi:ldə(r)]外场手 immediately starts running, anticipating the ball’s trajectory[trəˈdʒektəri]轨线. He tracks it, adapts his movements, and finally catches it (under a thunder of applause). Predicting the future is something you do all the time, whether you are finishing a friend’s sentence or anticipating the smell of coffee at breakfast. In this chapter we will discuss recurrent neural networks (RNNs), a class of nets that can predict the future (well, up to a point, of course). They can analyze time series data such as stock prices, and tell you when to buy or sell. In autonomous driving systems, they can anticipate car trajectories and help avoid accidents. More generally, they can work on sequences of arbitrary lengths, rather than on fixed-sized inputs like all the nets we have considered so far. For example, they can take sentences, documents, or audio samples as input, making them extremely useful for natural language processing(NLP) applications such as automatic translation or speech-to-text, or sentiment analysis (e.g., reading movie reviews and extracting the rater’s feeling about the movie).

In this chapter we will first look at the fundamental concepts underlying RNNs and how to train them using backpropagation through time, then we will use them to
forecast a time series. After that we’ll explore the two main difficulties that RNNs face:

• Unstable gradients (discussed in Chapter 11, e.g. vanishing/exploding gradients), which can be alleviated using various techniques, including recurrent dropout and recurrent layer normalization
• A (very) limited short-term memory, which can be extended using LSTM and GRU cells
RNNs are not the only types of neural networks capable of handling sequential data: for small sequences, a regular dense network can do the trick; and for very long
sequences, such as audio samples or text, convolutional neural networks can actually work quite well too. We will discuss both of these possibilities, and we will finish this
chapter by implementing a WaveNet: this is a CNN architecture capable of handling sequences of tens of thousands of time steps. In Chapter 16, we will continue to
explore RNNs and see how to use them for natural language processing(NLP), along with more recent architectures based on attention mechanisms. Let’s get started!

Recurrent Neurons and Layers

Up to now we have focused on feedforward neural networks, where the activations flow only in one direction, from the input layer to the output layer (a few exceptions
are discussed in Appendix E). A recurrent neural network looks very much like a feedforward neural network, except it also has connections pointing backward. Let’s
look at the simplest possible RNN, composed of one neuron receiving inputs, producing an output, and sending that output back to itself, as shown in Figure 15-1
(left). At each time step t (also called a frame帧), this recurrent neuron receives the inputs $x_{(t)}$ as well as its own output from the previous time step, $y_{(t-1)}$. Since there is no previous output at the first time step, it is generally set to 0. We can represent this tiny network against the time axis, as shown in Figure 15-1 (right). This is called unrolling the network through time随着时间的推移展开网络 (it’s the same recurrent neuron represented once per time step). Figure 15-1. A recurrent neuron (left) unrolled through time (right)

You can easily create a layer of recurrent neurons. At each time step t, every neuron receives both the input vector $x_{(t)}$
(###
If the input $x_{(t)}$ received by the recurrent neuron is a vector, then the recurrent neuron should be a neuron containing multiple units.
If the input $x_{(t)}$ received by the recurrent neuron is a scalar, then the recurrent neuron is just a neuron of one unit.
###)
(###
At time step t, the input instance contains multiple features, and each neuron receives multiple features
###)
and the output vector from the previous time step $y_{(t-1)}$, as shown in Figure 15-2. Note that both the inputs and outputs are vectors now (when there was just a single neuron, the output was a scalar. A scalar is a single number, usually represented by a lowercase variable name). Figure 15-2. A layer of recurrent neurons (left) unrolled through time (right)

Each recurrent neuron has two sets of weights: one for the inputs $x_{(t)}$ and the other for the outputs of the previous time step, $y_{(t-1)}$. Let’s call these weight vectors $w_x$ and $w_y$. If we consider the whole recurrent layer instead of just one recurrent neuron, we can place all the weight vectors in two weight matrices,
(###
shape = [num_input_instances=len(recurrent output neurons), num_features_for_each_instance=len(weight vectors) ]
###)

$W_x$ and $W_y$. The output vector of the whole recurrent layer can then be computed pretty much as you might expect, as shown in Equation 15-1 ($b$ is the bias vector and ϕ(·) is the activation function, e.g., ReLU).

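Equation 15-1. Output of a recurrent layer for a single instance

$$y_{(t)} = \phi\left(W_x^\top x_{(t)} + W_y^\top y_{(t-1)} + b\right)$$

### note: $x_{(t)}$ contains just one instance with multiple features per time step t and $y_{(t)}$ is just one item ###

Just as with feedforward neural networks, we can compute a recurrent layer’s output in one shot for a whole mini-batch by placing all the inputs at time step t in an input matrix $X_{(t)}$ (see Equation 15-2).

Equation 15-2. Outputs of a layer of recurrent neurons for all instances in a mini-batch ### note: $X_{(t)}$ contains multiple instances with multiple features per time step t ###

$$Y_{(t)} = \phi\left(X_{(t)} W_x + Y_{(t-1)} W_y + b\right)$$

OR

$$Y_{(t)} = \phi\left(\left[X_{(t)} \;\; Y_{(t-1)}\right] W + b\right) \quad \text{with} \quad W = \begin{bmatrix} W_x \\ W_y \end{bmatrix}$$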

In this equation:

• $Y_{(t)}$ is an $m \times n_{\text{neurons}}$ matrix containing the layer’s outputs at time step t for each instance in the mini-batch (m is the number of instances in the mini-batch and $n_{\text{neurons}}$ is the number of neurons).
• $X_{(t)}$ is an $m \times n_{\text{inputs}}$ matrix containing the inputs for all instances ($n_{\text{inputs}}$ is the number of input features).
• $W_x$ is an $n_{\text{inputs}} \times n_{\text{neurons}}$ matrix containing the connection weights for the inputs of the current time step.
• $W_y$ is an $n_{\text{neurons}} \times n_{\text{neurons}}$ matrix containing the connection weights for the outputs of the previous time step.
• $b$ is a vector of size $n_{\text{neurons}}$ containing each output neuron’s bias term.
• The weight matrices $W_x$ and $W_y$ are often concatenated vertically into a single weight matrix $W$ of shape $(n_{\text{inputs}} + n_{\text{neurons}}) \times n_{\text{neurons}}$ (see the second line of Equation 15-2).
• The notation $[X_{(t)} \;\; Y_{(t-1)}]$ represents the horizontal concatenation of the matrices $X_{(t)}$ and $Y_{(t-1)}$.
Notice that $Y_{(t)}$ is a function of $X_{(t)}$ and $Y_{(t-1)}$, which is a function of $X_{(t-1)}$ and $Y_{(t-2)}$, which is a function of $X_{(t-2)}$ and $Y_{(t-3)}$, and so on. This makes $Y_{(t)}$ a function of all the inputs since time t = 0 (that is, $X_{(0)}, X_{(1)}, \dots, X_{(t)}$). At the first time step, t = 0, there are no previous outputs, so they are typically assumed to be all zeros.
######################################### https://blog.csdn.net/Linli522362242/article/details/113846940

• The weight matrices  and  are often concatenated horizontally into a single weight matrix  of shape  (see the second line of Equation 15-2).
• The notation  represents the vertical concatenation of the matrices  and .
==> , ==>, ==>
the same weights are used at every time step
#########################################

Memory Cells

Since the output of a recurrent neuron at time step t is a function of all the inputs from previous time steps, you could say it has a form of memory. A part of a neural network that preserves some state across time steps is called a memory cell (or simply a cell). A single recurrent neuron, or a layer of recurrent neurons, is a very basic cell, capable of learning only short patterns (typically about 10 steps long, but this varies depending on the task). Later in this chapter, we will look at some more complex and powerful types of cells capable of learning longer patterns (roughly 10 times longer, but again, this depends on the task).

In general a cell’s state at time step t, denoted $h_{(t)}$ (the “h” stands for “hidden”), is a function of some inputs at that time step and its state at the previous time step: $h_{(t)} = f(h_{(t-1)}, x_{(t)})$. Its output at time step t, denoted $y_{(t)}$, is also a function of the previous state and the current inputs. In the case of the basic cells we have discussed so far, the output is simply equal to the state, but in more complex cells this is not always the case, as shown in Figure 15-3. Figure 15-3. A cell’s hidden state and its output may be different

Input and Output Sequences

An RNN can simultaneously take a sequence of inputs and produce a sequence of outputs (see the top-left network in Figure 15-4). This type of sequence-to-sequence network（### Many-to-many: Both the input and output arrays are sequences. This category can be further divided based on whether the input and output are synchronize['sɪŋkrənaɪzd]同步的.

An example of a synchronized many-to-many modeling task is video classification, where each frame in a video is labeled.
An example of a delayed many-to-many modeling task would be translating one language into another. For instance, an entire English sentence must be read and processed by a machine before its translation into German is produced.
###） is useful for predicting time series such as stock prices: you feed it the prices over the last N days, and it must output the prices shifted by one day into the future (i.e., from N – 1 days ago to tomorrow). https://blog.csdn.net/Linli522362242/article/details/113846940VSFigure 15-4. Seq-to-seq (top left), seq-to-vector (top right), vector-to-seq (bottom left), and Encoder–Decoder (bottom right) networks

Alternatively, you could feed the network a sequence of inputs and ignore all outputs except for the last one (see the top-right network in Figure 15-4). In other words, this is a sequence-to-vector network（### Many-to-one: The input data is a sequence, but the output is a fixed-size vector or scalar, not a sequence. For example, in sentiment analysis, the input is text-based (for example, a movie review) and the output is a class label (for example, a label denoting whether a reviewer liked the movie).###）. For example, you could feed the network a sequence of words corresponding to a movie review, and the network would output a sentiment score (e.g., from –1 [hate] to +1 [love]).

Conversely, you could feed the network the same input vector over and over again at each time step and let it output a sequence (see the bottom-left network of Figure 15-4 （### One-to-many: The input data is in standard format and not a sequence, but the output is a sequence. An example of this category is image captioning—the input is an image and the output is an English phrase summarizing the content of that image)###）. This is a vector-to-sequence network. For example, the input could be an image (or the output of a CNN), and the output could be a caption for that image.

Lastly, you could have a sequence-to-vector network, called an encoder, followed by a vector-to-sequence network, called a decoder (see the bottom-right network of Figure 15-4). For example, this could be used for translating a sentence from one language to another. You would feed the network a sentence in one language, the encoder would convert this sentence into a single vector representation, and then the decoder would decode this vector into a sentence in another language. This two-step model, called an Encoder–Decoder, works much better than trying to translate on the fly with a single sequence-to-sequence RNN (like the one represented at the top left): the last words of a sentence can affect the first words of the translation, so you need to wait until you have seen the whole sentence before translating it. We will see how to implement an Encoder–Decoder in Chapter 16 (as we will see, it is a bit more complex than in Figure 15-4 suggests).

Sounds promising, but how do you train a recurrent neural network?

Training RNNs

To train an RNN, the trick is to unroll it through time (like we just did) and then simply use regular backpropagation (see Figure 15-5). This strategy is called backpropagation through time (BPTT).Figure 15-5. Backpropagation through time

Just like in regular backpropagation,

there is a first forward pass through the unrolled network (represented by the dashed arrows).
Then the output sequence is evaluated using a cost function $C(Y_{(0)}, Y_{(1)}, \dots, Y_{(T)})$ (where T is the max time step). Note that this cost function may ignore some outputs, as shown in Figure 15-5 (for example, in a sequence-to-vector RNN, all outputs are ignored except for the very last one).
The gradients of that cost function are then propagated backward through the unrolled network (represented by the solid arrows).
Finally the model parameters are updated using the gradients computed during BPTT.
Note that the gradients flow backward through all the outputs used by the cost function, not just through the final output (for example, in Figure 15-5 the cost function is computed using the last three outputs of the network, $Y_{(2)}$, $Y_{(3)}$, and $Y_{(4)}$, so gradients flow through these three outputs, but not through $Y_{(0)}$ and $Y_{(1)}$). Moreover, since the same parameters W and b are used at each time step, backpropagation will do the right thing and sum over all time steps.

Fortunately, tf.keras takes care of all of this complexity for you—so let’s start coding!

Forecasting a Time Series

Suppose you are studying the number of active users per hour on your website, or the daily temperature in your city, or your company’s financial health, measured quarterly using multiple metrics. In all these cases, the data will be a sequence of one or more values per time step. This is called a time series. In the first two examples there is a single value per time step, so these are univariate time series, while in the financial example there are multiple values per time step (e.g., the company’s revenue, debt, and so on), so it is a multivariate time series. A typical task is to predict future values, which is called forecasting. Another common task is to fill in the blanks: to predict (or rather “postdict”) missing values from the past. This is called imputation. For example, Figure 15-6 shows 3 univariate time series, each of them 50 time steps long, and the goal here is to forecast the value at the next time step (represented by the X) for each of them.Figure 15-6. Time series forecasting

For simplicity, we are using a time series generated by the generate_time_series() function, shown here:

import numpy as np

def generate_time_series(batch_size, n_steps):
    """Generate a batch of noisy univariate time series.

    Each series is the sum of two sine waves with fixed amplitudes (0.5 and
    0.2) and random frequencies and phases, plus uniform noise in
    [-0.05, 0.05). Random numbers come from NumPy's global generator, so call
    ``np.random.seed`` beforehand for reproducible output.

    Args:
        batch_size: number of series to generate.
        n_steps: number of time steps per series.

    Returns:
        A ``float32`` array of shape ``(batch_size, n_steps, 1)``.
    """
    # Four independent draws in [0, 1), each of shape (batch_size, 1).
    freq1, freq2, offsets1, offsets2 = np.random.rand(4, batch_size, 1)
    time = np.linspace(0, 1, n_steps)  # shape (n_steps,)

    # Broadcasting: (n_steps,) against (batch_size, 1) gives
    # (batch_size, n_steps), i.e. one row per series.
    series = 0.5 * np.sin((time - offsets1) * (freq1 * 10 + 10))   # wave 1
    series += 0.2 * np.sin((time - offsets2) * (freq2 * 20 + 20))  # + wave 2
    series += 0.1 * (np.random.rand(batch_size, n_steps) - 0.5)    # + noise

    # Append a trailing feature axis: (batch_size, n_steps, 1).
    return series[..., np.newaxis].astype(np.float32)

This function creates as many time series as requested (via the batch_size argument), each of length n_steps, and there is just one value per time step in each series (i.e., all series are univariate). The function returns a NumPy array of shape [batch size, time steps, 1], where each series is the sum of two sine waves of fixed amplitudes [0.5,0.2] but random frequencies and phases ### [ (time-offsets1)*10,  (time-offsets2)*20 ] ###, plus a bit of noise.

####################################################
When dealing with time series (and other types of sequences such as sentences), the input features are generally represented as 3D arrays of shape [batch size, time steps, dimensionality], where dimensionality is 1 for univariate time series and more for multivariate time series.
####################################################

Now let’s create a training set, a validation set, and a test set using this function:

np.random.seed(42)

n_steps = 50
# Generate n_steps + 1 values per series: the first n_steps are the inputs and
# the final value (index n_steps) is the target to forecast.
series = generate_time_series(10000, n_steps + 1)
X_train, y_train = series[:7000, :n_steps], series[:7000, -1]
X_valid, y_valid = series[7000:9000, :n_steps], series[7000:9000, -1]
# The target is the last time step only (-1), matching the train and
# validation splits; the slice :-1 would return the inputs again.
X_test, y_test = series[9000:, :n_steps], series[9000:, -1]

X_train.shape, y_train.shape

X_train contains 7,000 time series (i.e., its shape is [7000, 50, 1]), while X_valid contains 2,000 (from the 7,000th time series to the 8,999th) and X_test contains 1,000 (from the 9,000th to the 9,999th). Since we want to forecast a single value for each series, the targets are column vectors (e.g., y_train has a shape of [7000, 1]).

import matplotlib.pyplot as plt
def plot_series(series, y=None, y_pred=None, x_label="$t$", y_label="$x(t)$"):
    """Plot one time series on the current axes, with optional target/forecast.

    Args:
        series: 1-D sequence of values, plotted against its index.
        y: optional true next value, drawn as a blue cross at ``x = n_steps``.
        y_pred: optional predicted next value, drawn as a red dot at
            ``x = n_steps``.
        x_label: x-axis label; a falsy value suppresses it.
        y_label: y-axis label; a falsy value suppresses it.

    Reads the module-level ``n_steps`` for the marker position and axis range.
    """
    plt.plot(series, ".-")
    if y is not None:
        plt.plot(n_steps, y, "bx", markersize=10)
    if y_pred is not None:
        plt.plot(n_steps, y_pred, "ro", markersize=10)

    if x_label:
        plt.xlabel(x_label, fontsize=16)
    if y_label:
        # The body of this branch was missing from the listing; label the
        # y axis the same way the x axis is labelled.
        plt.ylabel(y_label, fontsize=16)

    plt.hlines(0, 0, n_steps + 1, linewidth=1)
    plt.axis([0, n_steps + 1, -1, 1])
    plt.grid(True)

# One row of three panels sharing the y axis: the first three validation
# series, each with its target value.
fig, axes = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(18, 5))
for col, panel in enumerate(axes):
    plt.sca(panel)  # make this panel the current axes for plot_series
    plot_series(X_valid[col, :, 0], y_valid[col, 0], y_label="$x(t)$")

plt.show()

Baseline Metrics

Before we start using RNNs, it is often a good idea to have a few baseline metrics, or else we may end up thinking our model works great when in fact it is doing worse than basic models. For example, the simplest approach is to predict the last value in each series. This is called naive forecasting, and it is sometimes surprisingly difficult to outperform. In this case, it gives us a mean squared error of about 0.020:

from tensorflow import keras

# X_valid = series[7000:9000, :n_steps] # n_steps==50
# y_valid = series[7000:9000, -1] # series[7000:9000, 50]
y_pred = X_valid[:,-1]            # series[7000:9000, 49]

np.mean( keras.losses.mean_squared_error(y_valid, y_pred) )

plot_series( X_valid[0, :, 0], y_valid[0,0], y_pred[0,0] )
plt.show()

Another simple approach is to use a fully connected network. Since it expects a flat list of features for each input, we need to add a Flatten layer. Let’s just use a simple Linear Regression model so that each prediction will be a linear combination of the values in the time series:

import tensorflow as tf

np.random.seed(42)
tf.random.set_seed(42)

# Linear regression baseline: flatten the 50 time steps and feed them to a
# single Dense unit, so each forecast is a linear combination of the inputs.
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[50, 1]),
    keras.layers.Dense(1),
])

# fit() requires a compiled model; the accompanying text uses the MSE loss
# with the default Adam optimizer.
model.compile(loss="mse", optimizer="adam")
history = model.fit(X_train, y_train, epochs=20,
                    validation_data=(X_valid, y_valid))

If we compile this model using the MSE loss and the default Adam optimizer, then fit it on the training set for 20 epochs and evaluate it on the validation set, we get an MSE of about 0.004. That’s much better than the naive approach (about 0.020)!

model.evaluate( X_valid, y_valid )

the validation error is computed at the end of each epoch, while the training error is computed using a running mean during each epoch. So the training curve should be shifted by half an epoch to the left. 10_Introduction to Artificial Neural Networks w Keras_3_FashionMNIST_pydot_sparse_shift(0.)_plt_imgs_learing curve : https://blog.csdn.net/Linli522362242/article/details/106562190

import matplotlib as mpl

def plot_learning_curves(loss, val_loss):
    """Plot training and validation loss per epoch on the current axes.

    The validation loss is measured at the end of each epoch, whereas the
    training loss is a running mean over the epoch, so the training curve is
    drawn half an epoch to the left of the validation curve.

    Args:
        loss: sequence of training losses, one per epoch.
        val_loss: sequence of validation losses, one per epoch.
    """
    train_x = np.arange(len(loss)) - 0.5
    valid_x = np.arange(len(val_loss))
    plt.plot(train_x, loss, "b.-", label="Training loss")
    plt.plot(valid_x, val_loss, "r.-", label="Validation loss")

    # x range: epochs 1 to 20; y range: loss 0 to 0.05.
    plt.axis([1, 20, 0, 0.05])
    # integer=True restricts the major ticks to integer values.
    integer_ticks = mpl.ticker.MaxNLocator(integer=True)
    plt.gca().xaxis.set_major_locator(integer_ticks)
    plt.legend(fontsize=14)
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.grid(True)

plot_learning_curves( history.history["loss"], history.history["val_loss"] )

plt.show()

<==mpl.ticker.MaxNLocator(integer=True)<==
For demonstration, let's shift both curves above one epoch to the right, i.e. plot the training loss at np.arange(len(loss)) + 0.5 and the validation loss at np.arange(len(val_loss)) + 1:

import matplotlib as mpl

def plot_learning_curves(loss, val_loss):  # both loss and val_loss are lists
    """Plot training and validation loss with epochs counted from 1.

    The validation loss is measured at the end of each epoch (x = 1, 2, ...),
    whereas the training loss is a running mean over the epoch, so it is drawn
    half an epoch earlier (x = 0.5, 1.5, ...).

    Args:
        loss: sequence of training losses, one per epoch.
        val_loss: sequence of validation losses, one per epoch.
    """
    train_x = np.arange(len(loss)) + 0.5
    valid_x = np.arange(len(val_loss)) + 1
    plt.plot(train_x, loss, "b.-", label="Training loss")
    plt.plot(valid_x, val_loss, "r.-", label="Validation loss")

    plt.axis([0, 20, 0, 0.05])
    # integer=True restricts the major ticks to integer values.
    integer_ticks = mpl.ticker.MaxNLocator(integer=True)
    plt.gca().xaxis.set_major_locator(integer_ticks)
    plt.legend(fontsize=14)
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.grid(True)

plot_learning_curves( history.history["loss"], history.history["val_loss"] )

plt.show()

 # prediction on all data in X_valid but only just focus on [50]
y_pred = model.predict( X_valid )
# first 0: first series,
# second 0: the value is saved in third dimension(1 for univariate time series)
# X_valid.shape = (2000, 50, 1), 2000 series, 50 time steps, 1 for univariate
plot_series( X_valid[0, :, 0], y_valid[0,0], y_pred[0,0] )

Implementing a Simple RNN

Let’s see if we can beat that with a simple RNN:
######################################
Using the TensorFlow Keras API, a recurrent layer can be defined via SimpleRNN, which is similar to the output-to-output recurrence.https://blog.csdn.net/Linli522362242/article/details/113846940


# Manually reproduce the SimpleRNN computation (output-to-output recurrence)
# step by step and print it next to the layer's own output.
out_to_out = []
for t, x_t in enumerate(x_seq):
    xt = tf.reshape(x_t, (1, 5))
    print("Time step {} =>".format(t))
    print('   Input           :', xt.numpy())

    # Contribution of the current input.
    ht = tf.matmul(xt, w_xh) + b_h
    print('   Hidden          :', ht.numpy())

    # There is no previous output at the first time step, so use zeros.
    prev_output = out_to_out[t - 1] if t > 0 else tf.zeros(shape=(ht.shape))

    # Add the recurrent contribution, then apply tanh because that is the
    # activation used by SimpleRNN.
    ot = tf.math.tanh(ht + tf.matmul(prev_output, w_oo))
    out_to_out.append(ot)

    print('   Output (manual) :', ot.numpy())
    print('   SimpleRNN output: ', output[0][t].numpy())
    print()

###################################### since adam : default lr=0.001, let's increase it to 0.005

the behavior of a recurrent layer with respect to returning a sequence as output or simply using the last output can be specified by setting the argument return_sequences to True or False, respectively.

np.random.seed(42)
tf.random.set_seed(42)

# A single recurrent layer with a single neuron. input_shape=[None, 1] leaves
# the number of time steps unspecified.
# Layer defaults: use_bias=True, return_sequences=False.
model = keras.models.Sequential([
    keras.layers.SimpleRNN(1, input_shape=[None, 1]),
])

# `optimizer` was used without being defined; per the note above this cell,
# Adam's default learning rate of 0.001 is raised to 0.005.
optimizer = keras.optimizers.Adam(learning_rate=0.005)
model.compile(loss="mse", optimizer=optimizer)
history = model.fit(X_train, y_train, epochs=20,
                    validation_data=(X_valid, y_valid))

That’s really the simplest RNN you can build. It just contains a single layer, with a single neuron, as we saw in Figure 15-1. We do not need to specify the length of the input sequences (input_shape=[None,1], unlike in the previous model), since a recurrent neural network can process any number of time steps (this is why we set the first input dimension to None). By default, the SimpleRNN layer uses the hyperbolic tangent activation function. It works exactly as we saw earlier: the initial state $h_{(init)}$ is set to 0, and it is passed to a single recurrent neuron (for output), along with the value of the first time step, $x_{(0)}$. The neuron computes a weighted sum of these values (### ht=tf.matmul(xt, w_xh) + b_h ; ot = ht + tf.matmul(prev_output, w_oo) ###)
and applies the hyperbolic tangent activation function to the result (### ot = tf.math.tanh(ot) ###),
and this gives the first output, $y_{(0)}$ (### out_to_out.append(ot) ###).
In a simple RNN, this output is also the new state $h_{(0)}$ (### prev_output = out_to_out[t-1] ###).
This new state is passed to the same recurrent neuron along with the next input value, $x_{(1)}$, and the process is repeated until the last time step. Then the layer just outputs the last value, $y_{(49)}$. All of this is performed simultaneously for every time series.
#######################################
By default, recurrent layers in Keras only return the final output. To make them return one output per time step, you must set return_sequences=True, as we will see.
#######################################

If you compile, fit, and evaluate this model (just like earlier, we train for 20 epochs using Adam), you will find that its MSE reaches only 0.01, so it is better than the naive approach but it does not beat a simple linear model.### val_loss(here is "mse")=0.004145486235618591 ###Note that for each neuron, a linear model has one parameter per input(### input features, here is 1 ###) and per time step, plus a bias term (in the simple linear model we used, that’s a total of 51=50+1 parameters). In contrast, for each recurrent neuron in a simple RNN, there is just one parameter per input(### input features,  here is 1 : input_shape=[None,1]###) and per hidden state dimension
e.g.  h(t) dimension: 1x5
(in a simple RNN ###output-to-output recurrence###, that’s just the number of recurrent neurons in the layer ###  keras.layers.SimpleRNN( 1,...) ###  ), plus a bias term. In this simple RNN, that’s a total of just three parameters: one input weight, one recurrent weight, and one bias. (### The weight coefficients are obtained during the training process, so they are not discussed further here ###)

model.evaluate(X_valid, y_valid)

import matplotlib as mpl

def plot_learning_curves(loss, val_loss):
    """Plot training and validation loss per epoch on the current figure.

    Args:
        loss: Sequence of training-loss values, one per epoch.
        val_loss: Sequence of validation-loss values, one per epoch.
    """
    # The validation error is computed at the end of each epoch, while the
    # training error is computed using a running mean during each epoch, so
    # the training curve is shifted half an epoch to the left.
    plt.plot(np.arange(len(loss)) + 0.5, loss, "b.-", label="Training loss")
    plt.plot(np.arange(len(val_loss)) + 1, val_loss, "r.-", label="Validation loss")

    # Fixed view: x from 1 to 20, y from 0 to 0.05.
    plt.axis([1, 20, 0, 0.05])
    plt.gca().xaxis.set_major_locator(mpl.ticker.MaxNLocator(integer=False))
    plt.legend(fontsize=14)
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.grid(True)

# Learning curves of the run stored in `history`.
plot_learning_curves(history.history["loss"], history.history["val_loss"])
plt.show()

# Predict for every series in X_valid, but only plot the first one.
y_pred = model.predict(X_valid)
# Index [0, :, 0]: first series, all time steps, the single feature
# (X_valid.shape = (2000, 50, 1): 2000 series, 50 time steps, univariate).
plot_series(X_valid[0, :, 0], y_valid[0, 0], y_pred[0, 0])

#######################################

Trend and Seasonality

There are many other models to forecast time series, such as weighted moving average models (https://blog.csdn.net/Linli522362242/article/details/102314389) or autoregressive integrated moving average (ARIMA) models. Some of them require you to first remove the trend and seasonality. For example,

if you are studying the number of active users on your website, and it is growing by 10% every month, you would have to remove this trend from the time series. Once the model is trained and starts making predictions, you would have to add the trend back to get the final predictions.
Similarly, if you are trying to predict the amount of sunscreen lotion sold every month, you will probably observe strong seasonality: since it sells well every summer, a similar pattern will be repeated every year. You would have to remove this seasonality from the time series, for example by computing the difference between the value at each time step and the value one year earlier (this technique is called differencing). Again, after the model is trained and makes predictions, you would have to add the seasonal pattern back to get the final predictions.
When using RNNs, it is generally not necessary to do all this, but it may improve performance in some cases, since the model will not have to learn the trend or the seasonality.
#######################################

Apparently our simple RNN was too simple to get good performance. So let’s try to add more recurrent layers!

Deep RNNs

It is quite common to stack multiple layers of cells, as shown in Figure 15-7. This gives you a deep RNN.Figure 15-7. Deep RNN (left) unrolled through time (right)

Implementing a deep RNN with tf.keras is quite simple: just stack recurrent layers. In this example, we use three SimpleRNN layers (but we could add any other type of recurrent layer, such as an LSTM layer or a GRU layer, which we will discuss shortly):

#  the last layer use a SimpleRNN layer

np.random.seed(42)
tf.random.set_seed(42)

# Deep RNN: two 20-unit recurrent layers that return the full sequence,
# followed by a single-unit SimpleRNN that returns only its last output.
model = keras.models.Sequential([
    keras.layers.SimpleRNN(20, return_sequences=True, input_shape=[None, 1]),
    keras.layers.SimpleRNN(20, return_sequences=True),
    keras.layers.SimpleRNN(1),
])

# A Keras model must be compiled before fit(); MSE loss with Adam, as the
# surrounding text describes.
model.compile(loss="mse", optimizer="adam")

history = model.fit(X_train, y_train, epochs=20,
                    validation_data=(X_valid, y_valid))

###  the last layer use a SimpleRNN layer

##################

Make sure to set return_sequences=True for all recurrent layers (except the last one, if you only care about the last output). If you don’t, they will output a 2D array (containing only the output of the last time step with features,
########
e.g. $\mathbf{Y}_{(t)}$ is an $m \times n_{\text{neurons}}$ matrix containing the layer’s outputs at time step t for each instance in the mini-batch (m is the number of instances in the mini-batch and $n_{\text{neurons}}$ is the number of neurons).########) instead of a 3D array (containing outputs for all time steps), and the next recurrent layer will complain that you are not feeding it sequences in the expected 3D format.
##################

If you compile, fit, and evaluate this model, you will find that it reaches an MSE of 0.003. We finally managed to beat the linear model ### val_loss(here is "mse")=0.004145486235618591 ### !

# Validation score of the deep RNN, followed by its learning curves.
model.evaluate(X_valid, y_valid)
plot_learning_curves(history.history["loss"], history.history["val_loss"])
plt.show()

# Forecast for every validation series; plot only the first one.
y_pred = model.predict(X_valid)
plot_series(X_valid[0, :, 0], y_valid[0, 0], y_pred[0, 0])
plt.show()

Note that the last layer ### keras.layers.SimpleRNN(1) ### is not ideal: it must have a single unit because we want to forecast a univariate time series(there is a single value per time step), and this means we must have a single output value per time step. However, having a single unit means that the hidden state is just a single number. That’s really not much, and it’s probably not that useful; presumably, the RNN will mostly use the hidden states of the other recurrent layers to carry over all the information it needs from time step to time step, and it will not use the final layer’s hidden state very much. Moreover, since a SimpleRNN layer uses the tanh activation function by default, the predicted values must lie within the range –1 to 1. But what if you want to use another activation function? For both these reasons, it might be preferable to replace the output layer with a Dense layer: it would run slightly faster, the accuracy would be roughly the same, and it would allow us to choose any output activation function we want. If you make this change, also make sure to remove return_sequences=True from the second (now last) recurrent layer:

# the last layer use a Dense layer

np.random.seed(42)
tf.random.set_seed(42)

# Same deep RNN, but the output layer is a Dense layer, so the second
# recurrent layer only returns its last output (sequence-to-vector).
model = keras.models.Sequential([
    keras.layers.SimpleRNN(20, return_sequences=True, input_shape=[None, 1]),
    keras.layers.SimpleRNN(20),
    keras.layers.Dense(1),  # activation=None
])

# A Keras model must be compiled before fit().
model.compile(loss="mse", optimizer="adam")

history = model.fit(X_train, y_train, epochs=20,
                    validation_data=(X_valid, y_valid))

If you train this model, you will see that it converges faster and performs just as well. Plus, you could change the output activation function if you wanted.

# Validation score of the RNN + Dense model, followed by its learning curves.
model.evaluate(X_valid, y_valid)
plot_learning_curves(history.history["loss"], history.history["val_loss"])
plt.show()

# Forecast for every validation series; plot only the first one.
y_pred = model.predict(X_valid)
plot_series(X_valid[0, :, 0], y_valid[0, 0], y_pred[0, 0])
plt.show()

So far we have only predicted the value at the next time step, but we could just as easily have predicted the value several steps ahead by changing the targets appropriately (e.g., to predict 10 steps ahead, just change the targets to be the value 10 steps ahead instead of 1 step ahead). But what if we want to predict the next 10 values?

# RNN predicts next 10 values 1 by 1

The first option is to use the model we already trained, make it predict the next value, then add that value to the inputs (acting as if this predicted value had actually occurred), and use the model again to predict the following value, and so on, as in the following code:

import numpy as np

def generate_time_series(batch_size, n_steps):
    """Generate a batch of synthetic univariate time series.

    Each series is the sum of two sine waves with random frequency and
    offset, plus uniform noise. Uses NumPy's global random state.

    Args:
        batch_size: Number of series to generate.
        n_steps: Number of time steps in each series.

    Returns:
        A float32 array of shape (batch_size, n_steps, 1).
    """
    # Four independent draws in [0, 1), each of shape (batch_size, 1).
    freq1, freq2, offsets1, offsets2 = np.random.rand(4, batch_size, 1)
    time = np.linspace(0, 1, n_steps)  # shape (n_steps,)

    # `time` (n_steps,) broadcasts against the (batch_size, 1) parameters,
    # so every intermediate result has shape (batch_size, n_steps).
    series = 0.5 * np.sin((time - offsets1) * (freq1 * 10 + 10))   # wave 1
    series += 0.2 * np.sin((time - offsets2) * (freq2 * 20 + 20))  # + wave 2
    series += 0.1 * (np.random.rand(batch_size, n_steps) - 0.5)    # + noise

    # Append a trailing feature axis: (batch_size, n_steps, 1).
    return series[..., np.newaxis].astype(np.float32)

np.random.seed(43)  # not 42, as it would give the first series in the train set

# One new series of n_steps + 10 points: the first n_steps are the model
# input, the remaining 10 are the ground truth for the forecast.
series = generate_time_series(1, n_steps + 10)
X_new, Y_new = series[:, :n_steps], series[:, n_steps:]

X = X_new  # shape (1, n_steps, 1)

# Forecast 10 steps ahead, one step at a time. Each prediction is appended
# to X as if it had actually occurred, so errors may accumulate.
for step_ahead in range(10):
    # Slicing from `step_ahead` keeps the input window at n_steps values;
    # the prediction is expanded to [batch size, time steps, dimensions].
    y_pred_one = model.predict(X[:, step_ahead:])[:, np.newaxis, :]
    X = np.concatenate([X, y_pred_one], axis=1)

Y_pred = X[:, n_steps:]
Y_pred.shape

def plot_multiple_forecasts(X, Y, Y_pred):
    """Plot the first series in X with its actual and forecast continuation.

    Args:
        X: Input array indexed as [series, time step, feature].
        Y: Actual future values, indexed the same way.
        Y_pred: Forecast values, indexed the same way as Y.
    """
    n_steps = X.shape[1]
    ahead = Y.shape[1]  # forecast horizon, taken from the targets

    plot_series(X[0, :, 0])  # first series in X
    plt.plot(np.arange(n_steps, n_steps + ahead), Y[0, :, 0],
             "yo-", label="Actual")
    plt.plot(np.arange(n_steps, n_steps + ahead), Y_pred[0, :, 0],
             "bx-", label="Forecast", markersize=10)

    plt.legend(fontsize=14)


plot_multiple_forecasts(X_new, Y_new, Y_pred)
plt.show()

Figure 15-8. Forecasting 10 steps ahead, 1 step at a time

As you might expect, the prediction for the next step will usually be more accurate than the predictions for later time steps( vs ), since the errors might accumulate (as you can see in Figure 15-8). If you evaluate this approach on the validation set, you will find an MSE of about 0.029. This is much higher than the previous models, but it’s also a much harder task, so the comparison doesn’t mean much. It’s much more meaningful to compare this performance with naive predictions (just forecasting that the time series will remain constant for 10 time steps) or with a simple linear model. The naive approach is terrible (it gives an MSE of about 0.22), but the linear model gives an MSE of about 0.0188: it’s much better than using our RNN to forecast the future one step at a time, and also much faster to train and run. Still, if you only want to forecast a few time steps ahead, on more complex tasks, this approach may work well.

# Mean squared error between the actual values and the forecast.
# Alternative: np.mean( keras.metrics.mean_squared_error( Y_new, Y_pred ))
np.mean( keras.losses.mean_squared_error( Y_new, Y_pred ))

^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

np.random.seed(43)  # not 42, as it would give the first series in the train set

# One new series of n_steps + 10 points: the first n_steps are the model
# input, the remaining 10 are the ground truth for the forecast.
series = generate_time_series(1, n_steps + 10)
X_new, Y_new = series[:, :n_steps], series[:, n_steps:]

X = X_new  # shape (1, n_steps, 1)

# Forecast 10 steps ahead, one step at a time, feeding the model the whole
# (growing) sequence each time: original data plus earlier predictions.
for _ in range(10):
    # Expand the prediction to [batch size, time steps, dimensionality].
    y_pred_one = model.predict(X)[:, np.newaxis, :]
    X = np.concatenate([X, y_pred_one], axis=1)

Y_pred = X[:, n_steps:]

def plot_multiple_forecasts(X, Y, Y_pred):
    """Plot the first series in X with its actual and forecast continuation.

    Args:
        X: Input array indexed as [series, time step, feature].
        Y: Actual future values, indexed the same way.
        Y_pred: Forecast values, indexed the same way as Y.
    """
    n_steps = X.shape[1]
    ahead = Y.shape[1]  # forecast horizon, taken from the targets

    plot_series(X[0, :, 0])  # first series in X
    plt.plot(np.arange(n_steps, n_steps + ahead), Y[0, :, 0],
             "yo-", label="Actual")
    plt.plot(np.arange(n_steps, n_steps + ahead), Y_pred[0, :, 0],
             "bx-", label="Forecast", markersize=10)

    plt.legend(fontsize=14)


plot_multiple_forecasts(X_new, Y_new, Y_pred)
plt.show()

Graphically, the forecast curves produced from the entire growing sequence (the original data plus the gradually appended predictions) and from the fixed-length window (the latter part of the original data plus the gradually appended predictions) look the same, but the loss (MSE) is different, since the accumulated errors are different.

# Mean squared error between the actual values and the forecast.
np.mean( keras.losses.mean_squared_error( Y_new, Y_pred ))

The reason is the timeliness of the data (the newer the data, the more accurate the forecast  (since the errors accumulated are different))

#################

# np.random.seed(42)
# tf.random.set_seed(42)

# model = keras.models.Sequential([
#     keras.layers.SimpleRNN(20, return_sequences=True, input_shape=[None,1]),
#     keras.layers.SimpleRNN(20),
#     keras.layers.Dense(1)
# ])

# history = model.fit( X_train, y_train, epochs=20,
#                      validation_data=(X_valid, y_valid)
#                    )

let's predict the next 10 values one by one:
# the fixed-length data set (the latter part of the original data + the gradually added data((predictions)))

np.random.seed(42)

n_steps = 50
series = generate_time_series(10000, n_steps + 10)
# Inputs: the first n_steps values, e.g. (7000, 50, 1).
# Targets: the last 10 values of each series, e.g. (7000, 10).
X_train, Y_train = series[:7000, :n_steps], series[:7000, -10:, 0]
X_valid, Y_valid = series[7000:9000, :n_steps], series[7000:9000, -10:, 0]
X_test, Y_test = series[9000:, :n_steps], series[9000:, -10:, 0]

X = X_valid  # (2000, 50, 1)

# Forecast 10 steps ahead, one step at a time, on a fixed-length window:
# the latter part of the original data plus the predictions made so far.
for step_ahead in range(10):
    # Expand each prediction to [batch size, time steps, dimensions].
    y_pred_one = model.predict(X[:, step_ahead:])[:, np.newaxis, :]
    X = np.concatenate([X, y_pred_one], axis=1)

Y_pred = X[:, n_steps:, 0]  # (2000, 10)

np.mean(keras.metrics.mean_squared_error(Y_valid, Y_pred))

The reason is the timeliness of the data (the newer the data, the more accurate the forecast  (since the errors accumulated are different))
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV

np.random.seed(42)

n_steps = 50
series = generate_time_series(10000, n_steps + 10)

# Inputs are the first n_steps values of each series.
X_train = series[:7000, :n_steps]      # (7000, 50, 1)
X_valid = series[7000:9000, :n_steps]
X_test = series[9000:, :n_steps]

# Targets are the last 10 values of each series (feature axis dropped).
Y_train = series[:7000, -10:, 0]       # (7000, 10)
Y_valid = series[7000:9000, -10:, 0]
Y_test = series[9000:, -10:, 0]

X_train.shape

Now let's predict the next 10 values one by one:

X = X_valid  # (2000, 50, 1)

# Forecast 10 steps ahead, one step at a time, feeding the model the whole
# (growing) sequence each time.
for _ in range(10):
    # Expand each prediction to [batch size, time steps, dimensions].
    y_pred_one = model.predict(X)[:, np.newaxis, :]
    X = np.concatenate([X, y_pred_one], axis=1)

Y_pred = X[:, n_steps:, 0]
Y_pred.shape  # (2000, 10)

As you might expect, the prediction for the next step will usually be more accurate than the predictions for later time steps( vs ), since the errors might accumulate (as you can see in Figure 15-8). If you evaluate this approach on the validation set, you will find an MSE of about 0.027. It’s much more meaningful to compare this performance with naive predictions (just forecasting that the time series will remain constant for 10 time steps) or with a simple linear model. The naive approach is terrible (it gives an MSE of about 0.22), but the linear model gives an MSE of about 0.0188: it’s much better than using our RNN to forecast the future one step at a time, and also much faster to train and run. Still, if you only want to forecast a few time steps ahead, on more complex tasks, this approach may work well.

# Mean squared error between the validation targets and the forecast.
np.mean(keras.metrics.mean_squared_error(Y_valid, Y_pred))

The reason is the timeliness of the data (the newer the data, the more accurate the forecast)

Let's compare this performance with some baselines: naive predictions and a simple linear model:

# naive predictions

# Naive baseline: assume the series stays constant for the next 10 steps,
# i.e. repeat the last *observed* input value. (Using the last target value
# instead would leak future information into the baseline.)
# X_valid[:, -1] has shape (2000, 1); tiling gives (2000, 10), like Y_valid.
Y_naive_pred = np.tile(X_valid[:, -1], 10)
np.mean(keras.metrics.mean_squared_error(Y_valid, Y_naive_pred))

# the linear model

np.random.seed(42)
tf.random.set_seed(42)

# Linear baseline: flatten the 50 inputs and map them to the 10 targets
# with a single Dense layer (no activation).
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[50, 1]),
    keras.layers.Dense(10),
])

# A Keras model must be compiled before fit().
model.compile(loss="mse", optimizer="adam")

history = model.fit(X_train, Y_train, epochs=20,
                    validation_data=(X_valid, Y_valid))

The second option is to train an RNN to predict all 10 next values at once. We can still use a sequence-to-vector model, but it will output 10 values instead of 1. However, we first need to change the targets to be vectors containing the next 10 values:

# np.random.seed(42)

# n_steps = 50
# series = generate_time_series(10000, n_steps+10) # n_steps=50
# (7000, 50, 1)          # (7000, 10)
# X_train, Y_train = series[:7000, :n_steps], series[:7000, -10:, 0]
# X_valid, Y_valid = series[7000:9000, :n_steps], series[7000:9000, -10:, 0]
# X_test, Y_test = series[9000:, :n_steps], series[9000:, -10:, 0]

# RNN predicts all 10 next values(target batch_size x 10) at once and only at the very last time step:

### keras.layers.SimpleRNN(20, return_sequences=False) ###
Now we just need the output layer to have 10 units instead of 1 (### keras.layers.Dense(10) ###):

np.random.seed(42)
tf.random.set_seed(42)

# Sequence-to-vector RNN: the second recurrent layer returns only its last
# output, and the Dense layer maps it to the next 10 values at once.
model = keras.models.Sequential([
    keras.layers.SimpleRNN(20, return_sequences=True, input_shape=[None, 1]),
    keras.layers.SimpleRNN(20),  # many-to-one: only the last time step
    keras.layers.Dense(10),      # one output unit per forecast step
])

# A Keras model must be compiled before fit().
model.compile(loss="mse", optimizer="adam")

history = model.fit(X_train, Y_train, epochs=20,
                    validation_data=(X_valid, Y_valid))

After training this model, you can predict the next 10 values at once very easily:

np.random.seed(43)

# One instance with 50 + 10 time steps: 50 inputs, 10 ground-truth values.
series = generate_time_series(1, 50 + 10)
X_new = series[:, :50, :]
Y_new = series[:, -10:, :]

# Predict, then append a trailing axis: [batch_size, steps, features].
Y_pred = np.expand_dims(model.predict(X_new), axis=-1)
Y_pred

plot_multiple_forecasts(X_new, Y_new, Y_pred)
plt.show()

This model works nicely: the MSE for the next 10 time steps is about 0.008. That’s much better than the linear model. But we can still do better: indeed, instead of training the model to forecast the next 10 values only at the very last time step, we can train it to forecast the next 10 values at each and every time step.

forecast the next 10 values at each and every time step

In other words, we can turn this sequence-to-vector RNN into a sequence-to-sequence RNN. The advantage of this technique is that

the loss will contain a term for the output of the RNN at each and every time step, not just the output at the last time step.
This means there will be many more error gradients flowing through the model, and they won’t have to flow only through time; they will also flow from the output of each time step.
This will both stabilize and speed up training.
Now let's create an RNN that predicts the next 10 steps at each time step. That is, instead of just forecasting time steps 50 to 59 based on time steps 0 to 49, it will forecast time steps 1 to 10 at time step 0, then time steps 2 to 11 at time step 1, and so on, and finally it will forecast time steps 50 to 59 at the last time step. (Notice that the model is causal: when it makes predictions at any time step, it can only see past time steps). So each target must be a sequence of the same length as the input sequence, containing a 10-dimensional vector at each step.

Let’s prepare these target sequences:

np.random.seed(42)

n_steps = 50
series = generate_time_series(10000, n_steps + 10)  # 10000x60x1
X_train = series[:7000, :n_steps]  # 7000x50x1
X_valid = series[7000:9000, :n_steps]
X_test = series[9000:, :n_steps]

# Each target is a sequence of 10-dimensional vectors: at time step t the
# target holds the next 10 values of the series (steps t+1 .. t+10).
Y = np.empty((10000, n_steps, 10))  # 10000x50x10
for step_ahead in range(1, 10 + 1):
    # Column step_ahead-1 is the series shifted by step_ahead:
    # column 0 covers steps 1~50, column 1 steps 2~51, ..., column 9 10~59.
    Y[..., step_ahead - 1] = series[..., step_ahead:step_ahead + n_steps, 0]

Y_train = Y[:7000]
Y_valid = Y[7000:9000]
Y_test = Y[9000:]

X_train.shape, Y_train.shape

#########################

If a sequence : Deep ... Learni...

first time step in X: D

--> (Horizontal) Column

2nd time step in X: e

--> (Horizontal) Column

# Inspect how the targets line up with the raw series (first instance only).
# Slice of the first series starting at step 0.
series[0, 0:1+n_steps, 0] # n steps=50

# The same slice shifted one step ahead.
series[0, 1:1+n_steps, 0] # n steps=50

# Target column 0 of the first instance.
Y[0,...,1-1]

# Target column 1 of the first instance.
Y[0,...,2-1]

Y[0,0] # first time step

# X_train: [batch_size, steps, features]
X_train[0,0] # the first time step in the first instance

Y_train[0,0] # the first time step in the first target

# X_train: [batch_size, steps, features]
X_train[0,1] # the 2nd time step in the first instance

Y_train[0,1] # the 2nd time step in the first target

#########################
####################################################

It may be surprising that the targets will contain values that appear in the inputs (there is a lot of overlap between X_train and Y_train). Isn’t that cheating? Fortunately, not at all: at each time step, the model only knows about past time steps, so it cannot look ahead. It is said to be a causal model.
####################################################TimeDistributed layer

To turn the model into a sequence-to-sequence model, we must set return_sequences=True in all recurrent layers (even the last one), and we must apply the output Dense layer at every time step. Keras offers a
TimeDistributed layer for this very purpose: it wraps any layer (e.g., a Dense layer) and applies it at every time step of its input sequence. It does this efficiently, by reshaping the inputs so that each time step is treated as a separate instance

(i.e., it reshapes the inputs from [batch size, time steps, input dimensions] to [batch size × time steps, input dimensions]; in this example, the number of input dimensions is 20 because the previous SimpleRNN layer has 20 units),
then it runs the Dense layer, and
finally it reshapes the outputs back to sequences (i.e., it reshapes the outputs from [batch size × time steps, output dimensions] to [batch size, time steps, output dimensions]; in this example the number of output dimensions is 10, since the Dense layer has 10 units). ### Note that a TimeDistributed(Dense(n)) layer is equivalent to a Conv1D(n, kernel_size=1) layer. ###
Here is the updated model:

np.random.seed(42)
tf.random.set_seed(42)

# Sequence-to-sequence RNN: every recurrent layer returns its full sequence
# and the Dense(10) output layer is applied at each time step, so the model
# forecasts the next 10 values at each and every time step.
model = keras.models.Sequential([
    keras.layers.SimpleRNN(20, return_sequences=True, input_shape=[None, 1]),
    keras.layers.SimpleRNN(20, return_sequences=True),
    keras.layers.TimeDistributed(keras.layers.Dense(10)),
])


def last_time_step_mse(Y_true, Y_pred):
    """Return the MSE computed on the last time step only.

    In the indexing below, ":" selects all instances and "-1" selects the
    last time step.
    """
    return keras.metrics.mean_squared_error(Y_true[:, -1], Y_pred[:, -1])


# Train on the MSE over all outputs, but report the last-step MSE as the
# metric. The compile call was truncated here: only its `metrics=` argument
# survived.
# NOTE(review): the optimizer and learning rate are not shown in this file;
# Adam with learning_rate=0.01 is assumed -- confirm against the original.
model.compile(loss="mse",
              optimizer=keras.optimizers.Adam(learning_rate=0.01),
              metrics=[last_time_step_mse])

history = model.fit(X_train, Y_train, epochs=20,
                    validation_data=(X_valid, Y_valid))

The Dense layer actually supports sequences as inputs (and even higher-dimensional inputs): it handles them just like TimeDistributed(Dense(…)), meaning it is applied to the last input dimension only (independently across all time steps). Thus, we could replace the last layer with just Dense(10). For the sake of clarity, however, we will keep using TimeDistributed(Dense(10)) because it makes it clear that the Dense layer is applied independently at each time step and that the model will output a sequence[batch size, time steps, output dimensions], not just a single vector.

All outputs are needed during training, but only the output at the last time step is useful for predictions and for evaluation. So although we will rely on the MSE over all the outputs for training, we will use a custom metric for evaluation, to only compute the MSE over the output at the last time step.

We get a validation MSE of about 0.006, which is 25%(1-0.006/0.008=1-0.75=0.25) better than the previous model. You can combine this approach with the first one: just predict the next 10 values using this RNN, then concatenate these values to the input time series and use the model again to predict the next 10 values, and repeat the process as many times as needed. With this approach, you can generate arbitrarily long sequences. It may not be very accurate for long-term predictions, but it may be just fine if your goal is to generate original music or text, as we will see in Chapter 16.

np.random.seed(43)

# One instance with 50 + 10 time steps.
series = generate_time_series(1, 50 + 10)
X_new = series[:, :50, :]  # first 50 time steps: the input for prediction
Y_new = series[:, 50:, :]  # last 10 time steps: the actual values

# Keep all instances (":") but only the last time step ("-1") of the
# prediction, then append a trailing axis for plotting.
Y_pred = model.predict(X_new)[:, -1, :, np.newaxis]
Y_pred

plot_multiple_forecasts(X_new, Y_new, Y_pred)
plt.show()

Simple RNNs can be quite good at forecasting time series or handling other kinds of sequences, but they do not perform as well on long time series or sequences. Let’s discuss why and see what we can do about it.

Handling Long Sequences

To train an RNN on long sequences, we must run it over many time steps, making the unrolled RNN a very deep network. Just like any deep neural network it may suffer from the unstable gradients problem( This is when the gradients grow smaller and smaller, or larger and larger), discussed in Chapter 11 : https://blog.csdn.net/Linli522362242/article/details/106935910: it may take forever to train, or training may be unstable. Moreover, when an RNN processes a long sequence, it will gradually forget the first inputs in the sequence. Let’s look at both these problems, starting with the unstable gradients problem.

Many of the tricks we used in deep nets to alleviate the unstable gradients problem can also be used for RNNs: good parameter initialization, faster optimizers, dropout, and so on. However, nonsaturating activation functions (e.g., ReLU) may not help as much here; in fact, they may actually lead the RNN to be even more unstable during training. Why? Well, suppose Gradient Descent updates the weights in a way that increases the outputs slightly at the first time step. Because the same weights are used at every time step, the outputs at the second time step may also be slightly increased, and those at the third, and so on until the outputs explode—and a nonsaturating activation function does not prevent that. You can reduce this risk by using a smaller learning rate, but you can also simply use a saturating activation function like the hyperbolic tangent (this explains why it is the default). In much the same way, the gradients themselves can explode. If you notice that training is unstable, you may want to monitor the size of the gradients (e.g., using TensorBoard) and perhaps use Gradient Clipping (https://blog.csdn.net/Linli522362242/article/details/106935910).

Moreover, Batch Normalization(https://blog.csdn.net/Linli522362242/article/details/106935910) cannot be used as efficiently with RNNs as with deep feedforward nets. In fact, you cannot use it between time steps, only between recurrent layers. To be more precise, it is technically possible to add a BN layer to a memory cell (as we will see shortly) so that it will be applied at each time step (both on the inputs for that time step and on the hidden state from the previous step). However, the same BN layer will be used at each time step, with the same parameters, regardless of the actual scale and offset of the inputs and hidden state. In practice, this does not yield good results, as was demonstrated by César Laurent et al. in a 2015 paper:3 the authors found that BN was slightly beneficial only when it was applied to the inputs, not to the hidden states. In other words, it was slightly better than nothing when applied between recurrent layers (i.e., vertically in Figure 15-7), but not within recurrent layers (i.e., horizontally). In Keras this can be done simply by adding a Batch Normalization layer before each recurrent layer, but don’t expect too much from it.
###########
It is quite common to stack multiple layers of cells, as shown in Figure 15-7. This gives you a deep RNN.Figure 15-7. Deep RNN (left) unrolled through time (right)
###########

np.random.seed(42)
tf.random.set_seed(42)

# Sequence-to-sequence RNN with a BatchNormalization layer after each
# recurrent layer, so BN acts between layers rather than between time steps.
model = keras.models.Sequential([
    keras.layers.SimpleRNN(20, return_sequences=True, input_shape=[None, 1]),
    keras.layers.BatchNormalization(),
    keras.layers.SimpleRNN(20, return_sequences=True),
    keras.layers.BatchNormalization(),
    keras.layers.TimeDistributed(keras.layers.Dense(10)),  # next 10 values per step
])

# A Keras model must be compiled before fit(). The last-step MSE metric is
# the last_time_step_mse function defined earlier in this file.
model.compile(loss="mse", optimizer="adam", metrics=[last_time_step_mse])

history = model.fit(X_train, Y_train, epochs=20,
                    validation_data=(X_valid, Y_valid))

Another form of normalization often works better with RNNs: Layer Normalization. This idea was introduced by Jimmy Lei Ba et al. in a 2016 paper(Jimmy Lei Ba et al., “Layer Normalization,” arXiv preprint arXiv:1607.06450 (2016).): it is very similar to Batch Normalization, but instead of normalizing across the batch dimension, it normalizes across the features dimension. One advantage is that it can compute the required statistics on the fly, at each time step, independently for each instance. This also means that it behaves the same way during training and testing
(### as opposed to BN,https://blog.csdn.net/Linli522362242/article/details/106935910

Batch Normalization 是对这批所有样本的各个(independent)特征维度分别做归一化（6次）， Layer Normalization 是对这单个样本的所有(across)特征维度做归一化（3次）

将输入的图像shape记为[N, C, H, W]，这几个方法主要的区别就是在N_y 表示样本轴，  C_x表示通道轴，  F_z是每个通道的特征数量（here is 2 since [H,W]）。

BN是先一个通道（在多层神经元中就是某一层的纬度或者神经元的节点数目），接着在同一批所有样本的的一个(independent)特征，一个(independent)特征的做归一化, （或者说在batch上，对NHW做归一化），然后在下一个通道（对小batchsize效果不好）LN则是先一个样本，一个通道的所有(across)特征维度，一个通道的所有(across)特征维度做归一化。（或者说对CHW做归一化）,然后下一个样本 https://blog.csdn.net/Linli522362242/article/details/107596098

Equation 11-3. Batch Normalization algorithm (https://blog.csdn.net/Linli522362242/article/details/106935910)
During training: the Batch Normalization algorithm needs to estimate each input’s mean and standard deviation. It does so by evaluating the mean and standard deviation of the input over the current mini-batch. So during training, BN standardizes its inputs, then rescales and offsets them.

at test time:
One solution could be to wait until the end of training, then run the whole training set through the neural network and compute the mean and standard deviation of each input of the BN layer. These “final” input means and standard deviations could then be used instead of the batch input means and standard deviations when making predictions.
However, most implementations of Batch Normalization estimate these final statistics during training by using a moving average of the layer’s input means and standard deviations. This is what Keras does automatically when you use the BatchNormalization layer. To sum up, four parameter vectors are learned in each batch-normalized layer: γ (the output scale vector) and
β (the output offset vector) are learned through regular backpropagation, and
μ (the final input mean vector) and
σ (the final input standard deviation vector) are estimated using an exponential moving average. Note that μ and σ are estimated during training, but they are used only after training (to replace the batch input means and standard deviations in Equation 11-3).
### ), and it does not need to use exponential moving averages to estimate the feature statistics across all instances in the training set. Like BN, Layer Normalization learns a scale and an offset parameter for each input. In an RNN, it is typically used right after the linear combination of the inputs and the hidden states.

Let’s use tf.keras to implement Layer Normalization within a simple memory cell. For this, we need to define a custom memory cell. It is just like a regular layer, except its

call() method takes two arguments: the inputs at the current time step and the hidden states from the previous time step. Note that the states argument is a list containing one or more tensors.

In the case of a simple RNN cell it contains a single tensor equal to the outputs of the previous time step, but other cells may have multiple state tensors (e.g., an LSTMCell has a long-term state and a short-term state, as we will see shortly). A cell must also have a state_size attribute and an output_size attribute. In a simple RNN, both are simply equal to the number of units.

The following code implements a custom memory cell which will behave like a SimpleRNNCell, except it will also apply Layer Normalization at each time step:
from tensorflow.keras.layers import LayerNormalization

class LNSimpleRNNCell(keras.layers.Layer):
    """Simple RNN memory cell that applies Layer Normalization at each time step.

    The cell wraps a ``keras.layers.SimpleRNNCell`` created without an
    activation, normalizes its linear output with ``LayerNormalization``
    (per example, across the feature dimension) and only then applies the
    requested activation.

    Args:
        units: Positive integer, dimensionality of the output/state space.
        activation: Name (or callable) of the activation applied after the
            normalization. Resolved through ``keras.activations.get``.
        **kwargs: Forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, units, activation="tanh", **kwargs):
        super().__init__(**kwargs)
        # A cell must expose these two attributes; for a simple RNN both
        # are equal to the number of units.
        self.state_size = units
        self.output_size = units
        # activation=None: the inner cell only computes the linear combination
        # of inputs and previous state, so normalization can happen *before*
        # the non-linearity.
        self.simple_rnn_cell = keras.layers.SimpleRNNCell(units, activation=None)
        self.layers_norm = LayerNormalization()
        self.activation = keras.activations.get(activation)

    def get_initial_state(self, inputs=None, batch_size=None, dtype=None):
        """Return the all-zero initial state list ``[zeros(batch, units)]``.

        When ``inputs`` is given, the batch size and dtype are taken from it;
        otherwise the explicit ``batch_size``/``dtype`` arguments are used.
        """
        if inputs is not None:
            batch_size = tf.shape(inputs)[0]
            dtype = inputs.dtype
        if dtype is None:
            # Same default tf.zeros would use when no dtype is specified.
            dtype = tf.float32
        # The state must carry the resolved dtype; previously it was computed
        # but ignored, so the state was always float32.
        return [tf.zeros([batch_size, self.state_size], dtype=dtype)]

    def call(self, inputs, states):
        """Run one time step.

        Args:
            inputs: Tensor for the current time step, indexed [batch, feature].
            states: List of state tensors from the previous time step.

        Returns:
            ``(outputs, [new_state])`` where both are the normalized,
            activated output of this step.
        """
        # The inner cell returns its result twice (output and new state);
        # the second copy is redundant here and is discarded.
        outputs, _ = self.simple_rnn_cell(inputs, states)
        # Layer Normalization first, activation second.
        norm_outputs = self.activation(self.layers_norm(outputs))
        # Returned once as the output and once as the new hidden state.
        return norm_outputs, [norm_outputs]

The code is quite straightforward (It would have been simpler to inherit from SimpleRNNCell instead so that we wouldn’t have to create an internal SimpleRNNCell or handle the state_size and output_size attributes, but the goal here was to show how to create a custom cell from scratch.). Our LNSimpleRNNCell class inherits from the keras.layers.Layer class, just like any custom layer.

The constructor takes the number of units and the desired activation function, and
it sets the state_size and output_size attributes, then
creates a SimpleRNNCell with no activation function (because we want to perform Layer Normalization after the linear operation but before the activation function). Then
the constructor creates the LayerNormalization layer, and finally it fetches the desired activation function.

The call() method starts by applying
the simple RNN cell, which computes a linear combination of the current inputs and the previous hidden states, and it returns the result twice (indeed, in a SimpleRNNCell, the outputs are just equal to the hidden states: in other words, new_states[0] is equal to outputs, so we can safely ignore new_states in the rest of the call() method).
Next, the call() method applies Layer Normalization,
followed by the activation function.
Finally, it returns the outputs twice (once as the outputs, and once as the new hidden states).

To use this custom cell, all we need to do is create a keras.layers.RNN layer, passing it a cell instance:

np.random.seed(42)
tf.random.set_seed(42)

# Sequence-to-sequence model built layer by layer:
# two recurrent layers driven by the custom layer-normalized cell, followed by
# a Dense(10) head applied at every time step (10 forecast values per step).
model = keras.models.Sequential()
model.add(
    keras.layers.RNN(LNSimpleRNNCell(20), return_sequences=True,
                     input_shape=[None, 1])
)
model.add(keras.layers.RNN(LNSimpleRNNCell(20), return_sequences=True))
model.add(keras.layers.TimeDistributed(keras.layers.Dense(10)))

def last_time_step_mse(Y_true, Y_pred):
    """Mean squared error computed on the last time step only.

    Both arguments are sliced with ``[:, -1]``: every instance, final step.
    """
    final_targets = Y_true[:, -1]
    final_predictions = Y_pred[:, -1]
    return keras.metrics.mean_squared_error(final_targets, final_predictions)

# A Keras model must be compiled (loss + optimizer) before it can be trained;
# use the same settings as the other models in this file.
model.compile(loss="mse", optimizer="adam", metrics=[last_time_step_mse])
history = model.fit(
    X_train, Y_train, epochs=20,
    validation_data=(X_valid, Y_valid),
)

Similarly, you could create a custom cell to apply dropout between each time step. But there’s a simpler way: all recurrent layers (except for keras.layers.RNN) and all cells provided by Keras have a dropout hyperparameter and a recurrent_dropout hyperparameter: the former defines the dropout rate to apply to the inputs (at each time step), and the latter defines the dropout rate for the hidden states (also at each time step). No need to create a custom cell to apply dropout at each time step in an RNN.

With these techniques, you can alleviate the unstable gradients problem and train an RNN much more efficiently. Now let’s look at how to deal with the short-term memory problem.

Creating a Custom RNN Class

# Notebook-style inspection: displays the shape of the training inputs (value is not stored).
X_train.shape

class MyRNN(keras.layers.Layer):
    """Minimal recurrent layer that unrolls a memory cell over the time axis.

    Args:
        cell: A memory cell exposing ``state_size``, ``output_size`` and a
            ``call(inputs, states)`` returning ``(outputs, new_states)``
            (e.g. ``LNSimpleRNNCell(20)``).
        return_sequences: If True, return the output of every time step
            stacked along axis 1; otherwise return only the last output.
        **kwargs: Forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, cell, return_sequences=False, **kwargs):
        super().__init__(**kwargs)
        self.cell = cell
        self.return_sequences = return_sequences
        # Use the cell's own get_initial_state when it defines one,
        # otherwise fall back to an all-zero state.
        self.get_initial_state = getattr(
            self.cell, "get_initial_state", self.fallback_initial_state
        )

    def fallback_initial_state(self, inputs):
        """Return ``[zeros(batch, state_size)]`` matching the inputs' dtype."""
        # The state needs a leading batch dimension, like the cell outputs.
        batch_size = tf.shape(inputs)[0]
        return [tf.zeros([batch_size, self.cell.state_size], dtype=inputs.dtype)]

    @tf.function
    def call(self, inputs):
        """Run the cell over every time step of ``inputs`` (indexed [batch, step, ...])."""
        states = self.get_initial_state(inputs)
        shape = tf.shape(inputs)
        batch_size, n_steps = shape[0], shape[1]
        # Always create the array so the name exists on every code path;
        # it is empty when per-step outputs are not requested.
        sequences = tf.TensorArray(
            inputs.dtype, size=(n_steps if self.return_sequences else 0)
        )
        # Placeholder with the same shape as a real cell output: [batch, units].
        outputs = tf.zeros(
            shape=[batch_size, self.cell.output_size], dtype=inputs.dtype
        )
        for step in tf.range(n_steps):
            outputs, states = self.cell(inputs[:, step], states)
            if self.return_sequences:
                sequences = sequences.write(step, outputs)
        if self.return_sequences:
            # stack() is time-major ([step, batch, units]); transpose to the
            # batch-major layout the following layers expect.
            return tf.transpose(sequences.stack(), [1, 0, 2])
        return outputs

np.random.seed(42)
tf.random.set_seed(42)

# Same architecture as the keras.layers.RNN version, but driven by the
# hand-written MyRNN layer.
model = keras.models.Sequential()
model.add(MyRNN(LNSimpleRNNCell(20), return_sequences=True,
                input_shape=[None, 1]))
model.add(MyRNN(LNSimpleRNNCell(20), return_sequences=True))
model.add(keras.layers.TimeDistributed(keras.layers.Dense(10)))

model.compile(loss="mse", optimizer="adam", metrics=[last_time_step_mse])
history = model.fit(
    X_train,
    Y_train,
    epochs=20,
    validation_data=(X_valid, Y_valid),
)

Tackling the Short-Term Memory Problem

Due to the transformations that the data goes through when traversing an RNN, some information is lost at each time step. After a while, the RNN’s state contains virtually no trace of the first inputs. This can be a showstopper. Imagine Dory the fish(A character from the animated movies Finding Nemo and Finding Dory海底总动员 who has short-term memory loss.) trying to translate a long sentence; by the time she’s finished reading it, she has no clue how it started. To tackle this problem, various types of cells with long-term memory have been introduced. They have proven so successful that the basic cells are not used much anymore. Let’s first look at the most popular of these long-term memory cells: the LSTM cell.

LSTM cells

The Long Short-Term Memory (LSTM) cell was proposed in 1997(Sepp Hochreiter and Jürgen Schmidhuber, “Long Short-Term Memory,” Neural Computation 9, no. 8 (1997):1735–1780) by Sepp Hochreiter and Jürgen Schmidhuber and gradually improved over the years by several researchers, such as Alex Graves, Haşim Sak(Haşim Sak et al., “Long Short-Term Memory Based Recurrent Neural Network Architectures for Large Vocabulary Speech Recognition,” arXiv preprint arXiv:1402.1128 (2014).), and Wojciech Zaremba(Wojciech Zaremba et al., “Recurrent Neural Network Regularization,” arXiv preprint arXiv:1409.2329 (2014).). If you consider the LSTM cell as a black box, it can be used very much like a basic cell, except it will perform much better; training will converge faster, and it will detect long-term dependencies in the data. In Keras, you can simply use the LSTM layer instead of the SimpleRNN layer:

np.random.seed(42)
tf.random.set_seed(42)

# Two stacked LSTM layers; Dense(10) is applied at every time step.
model = keras.models.Sequential([
    keras.layers.LSTM(20, return_sequences=True, input_shape=[None, 1]),
    keras.layers.LSTM(20, return_sequences=True),
    keras.layers.TimeDistributed(keras.layers.Dense(10)),
])

# fit() requires a compiled model; use the same loss/optimizer/metric as the
# other models in this file.
model.compile(loss="mse", optimizer="adam", metrics=[last_time_step_mse])
history = model.fit(
    X_train, Y_train, epochs=20,
    validation_data=(X_valid, Y_valid),
)

Alternatively, you could use the general-purpose keras.layers.RNN layer, giving it an LSTMCell as an argument:

# Equivalent model expressed with the general-purpose RNN layer wrapping LSTMCell.
model = keras.models.Sequential()
model.add(
    keras.layers.RNN(keras.layers.LSTMCell(20), return_sequences=True,
                     input_shape=[None, 1])
)
model.add(keras.layers.RNN(keras.layers.LSTMCell(20), return_sequences=True))
model.add(keras.layers.TimeDistributed(keras.layers.Dense(10)))

# Notebook-style inspection: displays the shape of the training targets.
Y_train.shape

# NOTE(review): `model` is re-created just above without compile()/fit();
# confirm this evaluation is meant to run on the trained model from the earlier cell.
model.evaluate( X_valid, Y_valid )

# Plot the per-epoch training and validation loss stored in `history`.
plot_learning_curves( history.history["loss"],
history.history["val_loss"]
)
plt.show()

np.random.seed(43)

# One fresh series of 50 + 10 steps: the first 50 steps are the inputs,
# the remaining 10 are the values to forecast.
series = generate_time_series(1, 50+10)
X_new, Y_new = series[:, :50, :], series[:, 50:, :]
# [:, -1] keeps the model output at the last time step; np.newaxis appends a trailing axis.
Y_pred = model.predict( X_new )[:,-1][..., np.newaxis]
Y_pred

plot_multiple_forecasts(X_new, Y_new, Y_pred)
plt.show()

However, the LSTM layer uses an optimized implementation when running on a GPU (see Chapter 19), so in general it is preferable to use it (the RNN layer is mostly useful
when you define custom cells, as we did earlier).

So how does an LSTM cell work? Its architecture is shown in Figure 15-9.

If you don’t look at what’s inside the box, the LSTM cell looks exactly like a regular cell, except that its state is split into two vectors: $\mathbf{h}_{(t)}$ and $\mathbf{c}_{(t)}$ (“c” stands for “cell”). You can think of $\mathbf{h}_{(t)}$ as the short-term state and $\mathbf{c}_{(t)}$ as the long-term state.

Figure 15-9. LSTM cell https://blog.csdn.net/Linli522362242/article/details/113846940OR

Now let’s open the box! The key idea is that the network can learn what to store in the long-term state, what to throw away, and what to read from it. As the long-term state $\mathbf{c}_{(t-1)}$ traverses the network from left to right, you can see that it first goes through a forget gate ($\mathbf{f}_{(t)} \otimes \mathbf{c}_{(t-1)}$), dropping some memories, and then it adds some new memories via the addition operation (which adds the memories that were selected by an input gate, $\mathbf{i}_{(t)} \otimes \mathbf{g}_{(t)}$). The result $\mathbf{c}_{(t)}$ is sent straight out, without any further transformation. So, at each time step, some memories are dropped and some memories are added. Moreover, after the addition operation, the long-term state is copied and passed through the tanh function, and then the result is filtered by the output gate ($\mathbf{o}_{(t)}$). This produces the short-term state $\mathbf{h}_{(t)}$ (which is equal to the cell’s output for this time step, $\mathbf{y}_{(t)}$). Now let’s look at where new memories come from and how the gates work.

First, the current input vector $\mathbf{x}_{(t)}$ and the previous short-term state $\mathbf{h}_{(t-1)}$ are fed to four different fully connected layers. They all serve a different purpose:

• The main layer is the one that outputs $\mathbf{g}_{(t)}$. It has the usual role of analyzing the current inputs $\mathbf{x}_{(t)}$ and the previous (short-term) state $\mathbf{h}_{(t-1)}$. In a basic cell, there is nothing other than this layer, and its output goes straight out to $\mathbf{y}_{(t)}$ and $\mathbf{h}_{(t)}$. In contrast, in an LSTM cell this layer’s output does not go straight out, but instead its most important parts are stored in the long-term state (and the rest is dropped).
• The three other layers are gate controllers. Since they use the logistic activation function, their outputs range from 0 to 1. As you can see, their outputs are fed to element-wise multiplication operations, so if they output 0s they close the gate, and if they output 1s they open it. Specifically: —The forget gate (controlled by $\mathbf{f}_{(t)}$) controls which parts of the long-term state should be erased.
(This gate decides what information should be thrown away or kept. Information from the previous hidden state and information from the current input is passed through the sigmoid function. Values come out between 0 and 1. The closer to 0 means to forget, and the closer to 1 means to keep. OR This has a possibility of dropping values in the cell state if it gets pointwise multiplied by values near 0.).
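Now, $\mathbf{f}_{(t)}$ is computed as follows: $\mathbf{f}_{(t)} = \sigma(\mathbf{W}_{xf}^\top \mathbf{x}_{(t)} + \mathbf{W}_{hf}^\top \mathbf{h}_{(t-1)} + \mathbf{b}_f)$
—The input gate (controlled by $\mathbf{i}_{(t)}$) controls which parts of $\mathbf{g}_{(t)}$ should be added to the long-term state.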

First, we pass the previous hidden state and current input into a sigmoid function. That decides which values will be updated by transforming the values to be between 0 and 1. 0 means not important, and 1 means important. OR
You also pass the hidden state and current input into the tanh functionOR to squish values between -1 and 1 to help regulate the network.
Then you multiply the tanh output with the sigmoid output. The sigmoid output will decide which information is important to keep from the tanh output.OR.The cell state at time t is computed as follows: OR           do a pointwise addition which updates the cell state to new values that the neural network finds relevant. That gives us our new cell state.          after the addition operation, the long-term state is copied and passed through the tanh function, and then the result is filtered by the output gate
—Finally, the output gate (controlled by $\mathbf{o}_{(t)}$) controls which parts of the long-term state should be read and output at this time step, both to $\mathbf{h}_{(t)}$ and to $\mathbf{y}_{(t)}$.
First, we pass the previous hidden state and the current input into a sigmoid function.
OR
Then we pass the newly modified cell state to the tanh function.
We multiply the tanh output with the sigmoid output to decide what information the hidden state should carry. The output is the hidden state. The new cell state and the new hidden is then carried over to the next time step.
In short, an LSTM cell can learn to recognize an important input (that’s the role of the input gate), store it in the long-term state, preserve it for as long as it is needed (that’s the role of the forget gate), and extract it whenever it is needed. This explains why these cells have been amazingly successful at capturing long-term patterns in time series, long texts, audio recordings, and more.

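Equation 15-3 summarizes how to compute the cell’s long-term state, its short-term state, and its output at each time step for a single instance (the equations for a whole mini-batch are very similar).

Equation 15-3. LSTM computations (https://blog.csdn.net/Linli522362242/article/details/113846940)

$$
\begin{aligned}
\mathbf{i}_{(t)} &= \sigma\left(\mathbf{W}_{xi}^\top \mathbf{x}_{(t)} + \mathbf{W}_{hi}^\top \mathbf{h}_{(t-1)} + \mathbf{b}_i\right) \\
\mathbf{f}_{(t)} &= \sigma\left(\mathbf{W}_{xf}^\top \mathbf{x}_{(t)} + \mathbf{W}_{hf}^\top \mathbf{h}_{(t-1)} + \mathbf{b}_f\right) \\
\mathbf{o}_{(t)} &= \sigma\left(\mathbf{W}_{xo}^\top \mathbf{x}_{(t)} + \mathbf{W}_{ho}^\top \mathbf{h}_{(t-1)} + \mathbf{b}_o\right) \\
\mathbf{g}_{(t)} &= \tanh\left(\mathbf{W}_{xg}^\top \mathbf{x}_{(t)} + \mathbf{W}_{hg}^\top \mathbf{h}_{(t-1)} + \mathbf{b}_g\right) \\
\mathbf{c}_{(t)} &= \mathbf{f}_{(t)} \otimes \mathbf{c}_{(t-1)} + \mathbf{i}_{(t)} \otimes \mathbf{g}_{(t)} \\
\mathbf{y}_{(t)} &= \mathbf{h}_{(t)} = \mathbf{o}_{(t)} \otimes \tanh\left(\mathbf{c}_{(t)}\right)
\end{aligned}
$$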
In this equation:

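• $\mathbf{W}_{xi}$, $\mathbf{W}_{xf}$, $\mathbf{W}_{xo}$, $\mathbf{W}_{xg}$ are the weight matrices of each of the four layers for their connection to the input vector $\mathbf{x}_{(t)}$.
• $\mathbf{W}_{hi}$, $\mathbf{W}_{hf}$, $\mathbf{W}_{ho}$, $\mathbf{W}_{hg}$ are the weight matrices of each of the four layers for their connection to the previous short-term state $\mathbf{h}_{(t-1)}$.
• $\mathbf{b}_i$, $\mathbf{b}_f$, $\mathbf{b}_o$, $\mathbf{b}_g$ are the bias terms for each of the four layers. Note that TensorFlow initializes $\mathbf{b}_f$ to a vector full of 1s instead of 0s. This prevents forgetting everything at the beginning of training.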
Peephole connections

In a regular LSTM cell, the gate controllers can look only at the input $\mathbf{x}_{(t)}$ and the previous short-term state $\mathbf{h}_{(t-1)}$. It may be a good idea to give them a bit more context by letting them peek at the long-term state as well. This idea was proposed by Felix Gers and Jürgen Schmidhuber in 2000 (F. A. Gers and J. Schmidhuber, “Recurrent Nets That Time and Count,” Proceedings of the IEEE-INNS-ENNS International Joint Conference on Neural Networks (2000): 189–194). They proposed an LSTM variant with extra connections called peephole connections: the previous long-term state $\mathbf{c}_{(t-1)}$ is added as an input to the controllers of the forget gate and the input gate, and the current long-term state $\mathbf{c}_{(t)}$ is added as input to the controller of the output gate. This often improves performance, but not always, and there is no clear pattern for which tasks are better off with or without them: you will have to try it on your task and see if it helps.

In Keras, the LSTM layer is based on the keras.layers.LSTMCell cell, which does not support peepholes. The experimental tf.keras.experimental.PeepholeLSTMCell does, however, so you can create a keras.layers.RNN layer and pass a PeepholeLSTMCell to its constructor.

There are many other variants of the LSTM cell. One particularly popular variant is the GRU cell, which we will look at now.

GRU cells

https://zhuanlan.zhihu.com/p/83220665

The Gated Recurrent Unit (GRU) cell (see Figure 15-10) was proposed by Kyunghyun Cho et al. in a 2014 paper(Kyunghyun Cho et al., “Learning Phrase Representations Using RNN Encoder-Decoder for Statistical Machine Translation,” Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (2014): 1724–1734.) that also introduced the Encoder–Decoder network we discussed earlier.Figure 15-10. GRU cell https://blog.csdn.net/Linli522362242/article/details/113846940https://d2l.ai/chapter_recurrent-modern/gru.html==VS

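Equation 15-4 summarizes how to compute the cell’s state at each time step for a single instance:

$$
\begin{aligned}
\mathbf{z}_{(t)} &= \sigma\left(\mathbf{W}_{xz}^\top \mathbf{x}_{(t)} + \mathbf{W}_{hz}^\top \mathbf{h}_{(t-1)} + \mathbf{b}_z\right) \\
\mathbf{r}_{(t)} &= \sigma\left(\mathbf{W}_{xr}^\top \mathbf{x}_{(t)} + \mathbf{W}_{hr}^\top \mathbf{h}_{(t-1)} + \mathbf{b}_r\right) \\
\mathbf{g}_{(t)} &= \tanh\left(\mathbf{W}_{xg}^\top \mathbf{x}_{(t)} + \mathbf{W}_{hg}^\top \left(\mathbf{r}_{(t)} \otimes \mathbf{h}_{(t-1)}\right) + \mathbf{b}_g\right) \\
\mathbf{h}_{(t)} &= \mathbf{z}_{(t)} \otimes \mathbf{h}_{(t-1)} + \left(1 - \mathbf{z}_{(t)}\right) \otimes \mathbf{g}_{(t)}
\end{aligned}
$$

OR, in the mini-batch notation of d2l.ai, the reset and update gates are $\mathbf{R}_t = \sigma(\mathbf{X}_t \mathbf{W}_{xr} + \mathbf{H}_{t-1} \mathbf{W}_{hr} + \mathbf{b}_r)$ and $\mathbf{Z}_t = \sigma(\mathbf{X}_t \mathbf{W}_{xz} + \mathbf{H}_{t-1} \mathbf{W}_{hz} + \mathbf{b}_z)$,
where $\mathbf{X}_t \in \mathbb{R}^{n \times d}$, $\mathbf{H}_{t-1} \in \mathbb{R}^{n \times h}$ (number of hidden units: h)
and $\mathbf{W}_{xr}, \mathbf{W}_{xz} \in \mathbb{R}^{d \times h}$, $\mathbf{W}_{hr}, \mathbf{W}_{hz} \in \mathbb{R}^{h \times h}$ are weight parameters
and $\mathbf{b}_r, \mathbf{b}_z \in \mathbb{R}^{1 \times h}$ are biases. Note that broadcasting is triggered during the summation. We use sigmoid functions to transform input values to the interval (0,1).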

Variables : The original is a sigmoid function. and  : The original is a hyperbolic tangent ( to ensure that the values in the candidate hidden state(OR) remain in the interval (−1,1).

for a given time step t

: input vector   (number of examples: n, number of inputs: d)
: output vector (number of hidden units: h)
: update gate vector ( <== input <== and  ### hidden: bias )
: reset gate vector    ( <== input <== and  ### hidden: bias) ==> OR
OR : candidate activation vector ( <== input <==, )
W, U and b: parameter matrices and vector
I use  to explain:
If the update gate controller outputs a 1, the forget gate is open (= 1, then ) and the input gate is closed (1 – 1 = 0).
If update gateoutputs a 0, the opposite happens.

Comparing with ,now the influence of the previous statescan be reduced with the elementwise multiplication
I use  to explain:
Whenever the entries in the reset gate  are close to 1, we recover a RNN such as in
For all entries of the reset gate  that are close to 0, the candidate hidden state((OR)) is the result of an MLP(Multilayer Perceptron) with  as the input. Any pre-existing hidden state is thus reset to defaultshttps://d2l.ai/chapter_recurrent-modern/gru.html

The GRU cell is a simplified version of the LSTM cell, and it seems to perform just as well (A 2015 paper by Klaus Greff et al., “LSTM: A Search Space Odyssey”, seems to show that all LSTM variants perform roughly the same.) (which explains its growing popularity). These are the main simplifications:

First Figure:

• Both state vectors ($\mathbf{c}_{(t)}$ and $\mathbf{h}_{(t)}$) are merged into a single vector $\mathbf{h}_{(t)}$.
• A single gate controller $\mathbf{z}_{(t)}$ (the update gate) controls both the forget gate and the input gate.
If the gate controller outputs a 1, the forget gate is open ($\mathbf{z}_{(t)} = 1$, then $\mathbf{h}_{(t)} = \mathbf{h}_{(t-1)}$) and the input gate is closed (1 – 1 = 0).
If it outputs a 0, the opposite happens.
In other words, whenever a memory must be stored, the location where it will be stored is erased first. This is actually a frequent variant to the LSTM cell in and of itself.
• There is no output gate; the full state vector is output at every time step. However, there is a new gate controller (reset gate) $\mathbf{r}_{(t)}$ that controls which part of the previous state will be shown to the main layer ($\mathbf{g}_{(t)}$).
Keras provides a keras.layers.GRU layer (based on the keras.layers.GRUCell memory cell); using it is just a matter of replacing SimpleRNN or LSTM with GRU.

def generate_time_series(batch_size, n_steps):
    """Generate a batch of noisy univariate time series.

    Each series is the sum of two sine waves with random frequency and
    phase offset, plus a small amount of uniform noise.

    Args:
        batch_size: Number of independent series to generate.
        n_steps: Number of time steps in every series.

    Returns:
        A float32 NumPy array of shape ``(batch_size, n_steps, 1)``.
        Values are drawn from the global NumPy random state, so call
        ``np.random.seed`` beforehand for reproducible output.
    """
    # Four random columns of shape (batch_size, 1): one frequency and one
    # offset for each of the two waves.
    freq1, freq2, offsets1, offsets2 = np.random.rand(4, batch_size, 1)
    time = np.linspace(0, 1, n_steps)  # shape (n_steps,)

    # Broadcasting (n_steps,) against (batch_size, 1) gives
    # (batch_size, n_steps): one row per series, one column per time step.
    series = 0.5 * np.sin((time - offsets1) * (freq1 * 10 + 10))   # wave 1
    # "+=" is an element-wise, in-place addition (the shape is unchanged).
    series += 0.2 * np.sin((time - offsets2) * (freq2 * 20 + 20))  # + wave 2
    series += 0.1 * (np.random.rand(batch_size, n_steps) - 0.5)    # + noise

    # Append a trailing feature axis: (batch_size, n_steps, 1).
    return series[..., np.newaxis].astype(np.float32)

np.random.seed(42)

n_steps = 50
series = generate_time_series(10000, n_steps + 10)  # 10000 x 60 x 1
X_train = series[:7000, :n_steps]                   # 7000 x 50 x 1
X_valid = series[7000:9000, :n_steps]
X_test = series[9000:, :n_steps]

# Targets for sequence-to-sequence training: at every one of the n_steps
# input positions the model must forecast the next 10 values.
Y = np.empty((10000, n_steps, 10))                  # 10000 x 50 x 10
for step_ahead in range(1, 10 + 1):
    # Column step_ahead-1 holds the series shifted by step_ahead steps:
    # steps 1..50 for column 0, 2..51 for column 1, ..., 10..59 for column 9.
    Y[..., step_ahead - 1] = series[..., step_ahead:step_ahead + n_steps, 0]

Y_train = Y[:7000]
Y_valid = Y[7000:9000]
Y_test = Y[9000:]

# Notebook-style inspection of the resulting shapes.
X_train.shape, Y_train.shape

import tensorflow as tf
from tensorflow import keras

np.random.seed(42)
tf.random.set_seed(42)

# Two stacked GRU layers; Dense(10) is applied at every time step.
model = keras.models.Sequential()
model.add(keras.layers.GRU(20, return_sequences=True, input_shape=[None, 1]))
model.add(keras.layers.GRU(20, return_sequences=True))
model.add(keras.layers.TimeDistributed(keras.layers.Dense(10)))

def last_time_step_mse(Y_true, Y_pred):
    """Mean squared error computed on the last time step only.

    Both arguments are sliced with ``[:, -1]``: every instance, final step.
    """
    final_targets = Y_true[:, -1]
    final_predictions = Y_pred[:, -1]
    return keras.metrics.mean_squared_error(final_targets, final_predictions)

# A Keras model must be compiled (loss + optimizer) before fit()/evaluate().
model.compile(loss="mse", optimizer="adam", metrics=[last_time_step_mse])
history = model.fit(X_train, Y_train, epochs=20,
                    validation_data=(X_valid, Y_valid))

model.evaluate(X_valid, Y_valid)

import matplotlib as mpl
import matplotlib.pyplot as plt

def plot_learning_curves(loss, val_loss):
    """Plot training and validation loss per epoch on the current figure.

    Args:
        loss: Sequence of training-loss values, one per epoch.
        val_loss: Sequence of validation-loss values, one per epoch.
    """
    # The validation error is measured at the end of each epoch, whereas the
    # training error is a running mean over the epoch, so the training curve
    # is drawn half an epoch to the left of the validation curve.
    training_x = np.arange(len(loss)) + 0.5
    validation_x = np.arange(len(val_loss)) + 1
    plt.plot(training_x, loss, "b.-", label="Training loss")
    plt.plot(validation_x, val_loss, "r.-", label="Validation loss")

    plt.axis([0, 20, 0, 0.05])
    x_axis = plt.gca().xaxis
    x_axis.set_major_locator(mpl.ticker.MaxNLocator(integer=False))
    plt.legend(fontsize=14)
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.grid(True)

# Plot the per-epoch training and validation loss stored in `history`.
plot_learning_curves(history.history["loss"], history.history["val_loss"])

plt.show()

def generate_time_series(batch_size, n_steps):
    """Generate a batch of noisy univariate time series.

    Each series is the sum of two sine waves with random frequency and
    phase offset, plus a small amount of uniform noise.

    Args:
        batch_size: Number of independent series to generate.
        n_steps: Number of time steps in every series.

    Returns:
        A float32 NumPy array of shape ``(batch_size, n_steps, 1)``.
        Values are drawn from the global NumPy random state, so call
        ``np.random.seed`` beforehand for reproducible output.
    """
    # Four random columns of shape (batch_size, 1): one frequency and one
    # offset for each of the two waves.
    freq1, freq2, offsets1, offsets2 = np.random.rand(4, batch_size, 1)
    time = np.linspace(0, 1, n_steps)  # shape (n_steps,)

    # Broadcasting (n_steps,) against (batch_size, 1) gives
    # (batch_size, n_steps): one row per series, one column per time step.
    series = 0.5 * np.sin((time - offsets1) * (freq1 * 10 + 10))   # wave 1
    # "+=" is an element-wise, in-place addition (the shape is unchanged).
    series += 0.2 * np.sin((time - offsets2) * (freq2 * 20 + 20))  # + wave 2
    series += 0.1 * (np.random.rand(batch_size, n_steps) - 0.5)    # + noise

    # Append a trailing feature axis: (batch_size, n_steps, 1).
    return series[..., np.newaxis].astype(np.float32)

np.random.seed(43)

# One fresh series of 50 + 10 steps: the first 50 steps are the inputs,
# the remaining 10 are the values to forecast.
series = generate_time_series(1, 50+10)
X_new, Y_new = series[:, :50, :], series[:, 50:, :]
# [:, -1] keeps the model output at the last time step; np.newaxis then
# appends a trailing axis so the forecast can be indexed like Y_new.
Y_pred = model.predict( X_new )[:, -1][..., np.newaxis]
Y_pred  # notebook-style display of the forecast

def plot_series(series,
                y=None,
                y_pred=None,
                x_label="$t$", y_label="$x(t)$"):
    """Plot one time series, optionally with a target and a predicted value.

    Args:
        series: 1-D sequence of values to draw as a dotted line.
        y: Optional true next value, drawn as a blue cross at x = n_steps.
        y_pred: Optional predicted next value, drawn as a red dot at x = n_steps.
        x_label: Label for the x axis; skipped when empty/None.
        y_label: Label for the y axis; skipped when empty/None.

    Note:
        Reads the module-level ``n_steps`` for the marker position and the
        axis limits.
    """
    plt.plot(series, ".-")
    if y is not None:
        plt.plot(n_steps, y, "bx", markersize=10)
    if y_pred is not None:
        plt.plot(n_steps, y_pred, "ro", markersize=10)

    if x_label:
        plt.xlabel(x_label, fontsize=16)
    if y_label:
        # Mirrors the x-label handling; this call was missing, which left the
        # `if` without a body.
        plt.ylabel(y_label, fontsize=16)

    plt.hlines(0, 0, n_steps + 1, linewidth=1)
    plt.axis([0, n_steps + 1, -1, 1])
    plt.grid(True)

def plot_multiple_forecasts(X, Y, Y_pred):
    """Plot the first input series with its actual and forecast continuation.

    Args:
        X: Input series, indexed [instance, step, feature].
        Y: Actual future values, indexed [instance, step, feature].
        Y_pred: Forecast future values, indexed like ``Y``.
    """
    n_steps = X.shape[1]
    # Forecast horizon = number of future steps in Y (was referenced but
    # never defined).
    ahead = Y.shape[1]

    plot_series(X[0, :, 0])  # first series in X
    future_steps = np.arange(n_steps, n_steps + ahead)
    plt.plot(future_steps, Y[0, :, 0],
             "yo-", label="Actual")
    plt.plot(future_steps, Y_pred[0, :, 0],
             "bx-", label="Forecast", markersize=10)

    # plot_series limits the x axis to the input length; widen it so the
    # forecast horizon is visible.
    plt.axis([0, n_steps + ahead, -1, 1])
    plt.legend(fontsize=14)

# Draw the new series together with its actual and forecast continuation.
plot_multiple_forecasts(X_new, Y_new, Y_pred)

plt.show()

LSTM and GRU cells are one of the main reasons behind the success of RNNs. Yet while they can tackle much longer sequences than simple RNNs, they still have a fairly limited short-term memory, and they have a hard time learning long-term patterns in sequences of 100 time steps or more, such as audio samples, long time series, or long sentences. One way to solve this is to shorten the input sequences, for example using 1D convolutional layers.

Using 1D convolutional layers to process sequences

https://blog.csdn.net/Linli522362242/article/details/115258834

展开全文
• 题目：CNN-SLAM: Realtime dense monocular SLAM with learned depth prediction 出处：Tateno, K., Tombari, F., Laina, I., & Navab, N. (2017). Cnn-slam: Real-time dense monocular slam with learned ...
目录

基本情况

摘要

1 介绍

2 相关工作

SLAM

单视图深度预测

3 提出的单目语义SLAM

3.1 相机位姿估计

3.2 基于CNN的深度预测与语义分割

3.3 关键帧创建和位姿图优化

3.4 逐帧深度细化

3.5 全局模型与语义标签融合

4 评估

4.1 与其他SLAM方法的比较

4.2 纯旋转运动下的精度

4.3 加入三维语义重建

5 结论

参考

基本情况

题目：CNN-SLAM: Real-time dense monocular SLAM with learned depth prediction
出处：Tateno, K., Tombari, F., Laina, I., & Navab, N. (2017). Cnn-slam: Real-time dense monocular slam with learned depth prediction. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 6243-6252).
视频demo: https://www.bilibili.com/video/av49294700/
相关代码：
https://github.com/tum-vision/lsd_slam
https://github.com/raulmur/ORB_SLAM2
https://github.com/uzh-rpg/rpg_open_remode
http://campar.in.tum.de/view/Chair/ProjectInSeg

摘要

随着卷积神经网络（CNN）在深度预测中取得了越来越多的进展，本文探讨了如何将卷积神经网络的深度预测用于高精度稠密单目SLAM系统中。我们提出了一种方法，CNN预测的稠密深度图与从直接单目SLAM获得的深度测量值自然融合在一起。我们的融合方案在单目SLAM方法效果不佳的图像位置进行深度预测，具有优势，例如低纹理区域，反之亦然。我们演示了使用深度预测来估计重建的绝对尺度，克服了单目SLAM的主要局限之一。最后，我们提出了一个框架，该框架可以有效地融合从单个帧获得的语义标签和稠密SLAM，从而从单个视图产生语义一致的场景重构。在两个标准数据集上的评估结果表明了我们方法的鲁棒性和准确性。

图1.提出的单目SLAM方法（a）可以估算出比现有技术（b）好得多的绝对尺度，这对于许多SLAM应用（例如AR）是必需的，例如图中骨骼模型被增强（叠加）到重建结果中。（c）我们的方法可以从单一视图融合3D重建和语义重建。

1 介绍

SfM和SLAM都是计算机视觉和机器人技术领域非常活跃的研究领域，其目的是实现3D场景重建以及通过3D和成像传感器进行相机姿态估计。最近，与移动深度传感器所获得的深度图融合在一起的实时SLAM方法得到了越来越广泛的应用，因为它们可用于：

从移动机器人到无人机以及从导航到机器人的多种类型的自主导航与建图，
同时也适用于许多增强现实和计算机图形学的应用。
除了导航和建图外，体积融合方法（例如Kinect Fusion [21]以及基于RGB-D数据的稠密SLAM方法[30，11]）还可用于高精度场景重建。
但是，这种方法的主要缺点是深度相机有一些局限性：

尽管不是不可能在室外环境中工作，但的确大多数相机的工作范围有限，并且基于主动感应的相机无法在阳光下工作（或表现不佳），从而使重建和制图的精度降低。
通常，由于深度相机不如彩色相机普遍存在，因此许多研究兴趣集中在单个相机的稠密和半稠密SLAM方法上[22-DTAM-rgbd稠密重建，4-LSD-SLAM，20-ORB-SLAM]。这些方法通过对相邻帧对进行短基线立体匹配来估计当前视点的深度图，从而实现实时单目场景重建。前提假设是，相机会随着时间在空间中平移，因此可以将成对的连续帧视为一个整体。 而匹配通常是通过灰度不变性（直接法）或关键点提取和匹配（特征点法）来进行的。单目SLAM方法的主要局限性是对绝对尺度的估计。实际上，即使相机位姿估计和场景重建准确执行，这种重建的绝对尺度本质上仍然是不精确的，这限制了单目SLAM在增强现实和机器人技术领域的大多数应用场景（如图1中b）。 一些方法提出将场景与一组预定义的3D模型进行匹配并通过目标检测来解决问题，基于估计的目标大小来恢复初始比例[6]，但是在场景中缺少已知尺度的情况下该方法会失败。

单目SLAM的另一个主要局限在于——

在相机纯旋转运动下，由于缺乏立体基线，因此无法使用立体估计，从而导致追踪失败。
最近，出现了一种新的研究方法，该方法通过学习的方法解决了单个图像的深度预测问题。特别是，以端到端的方式使用深度卷积神经网络（CNN）[16、2、3]证明了即使在缺少单目所需条件（纹理，重复图案）的情况下，也能有相对较好的方法和良好的绝对精度，从而使深度图具有回归的潜力，来完成深度估计任务。

深度学习方法的一个优势是，

可以从例子中学习绝对尺度，从而可以从单个图像进行预测，而无需基于场景的假设或几何约束，这与[10，18，1]不同。
这种深度学习的主要局限性在于，

尽管全局准确，但深度边界往往会局部模糊：因此，如果像[16]中那样将深度融合在一起进行场景重建，则重建的场景将总体上缺少形状细节。
与此相关的是，尽管提出了几种用于单视图深度预测的方法，但迄今为止，深度预测在更高级别的计算机视觉任务中却被大多数人所忽视，文献[16]中给出了几个例子。本文工作的主要思想是，从两种方法中汲取优点，并提出一种单目SLAM方法，

该方法将深度网络和直接SLAM计算的单目深度估计融合在一起，进行深度预测，从而进行稠密场景重建，同时具有精确的绝对尺度和鲁棒的追踪能力。
为了恢复模糊的深度边界，将CNN预测的深度图作为稠密重建的初始估计，并通过类似于[4]（LSD-SLAM）中的短基线立体匹配的直接SLAM方案逐步完善。
重要的是，边缘区域往往很模糊，而

短基线立体匹配则具有在预测深度图像上细化边缘区域的能力。
同时，从CNN预测的深度图获得的初始估计可以提供绝对尺度信息来进行位姿估计，因此在绝对尺度方面，与单纯的单目SLAM相比，CNN估计的位姿轨迹和场景重建更加精确。图1的a展示了一个例子，该示例说明了使用本文中提出的精确绝对尺度方法进行场景重建的有效性。此外，由于CNN预测的深度不会受到上述纯旋转问题的影响，因此可以使追踪更加鲁棒，因为它是在单个帧上单独进行估计的。

再者，由于可以在同一架构的不同计算资源（GPU和CPU）上

同时执行CNN的深度预测和
深度细化这两个过程，
因此该框架可以实时运行。

近期关于CNN的另一个研究方面是，相同的网络结构可以用于不同的高维回归任务，而不仅仅是深度估计：一个典型的例子是语义分割[3，29]。我们根据这一点作为我们框架的扩展，该框架使用像素级标签来将语义标签与稠密SLAM有效地融合在一起，从而从单个视图获得语义连贯的场景重建（如图1所示）。值得注意的是，就我们所知，语义重建仅在近期才有所发展，并且在单目情况下仅基于立体[28]或RGB-D数据[15]的方法还从未提出过。

我们使用两个针对单目SLAM和深度预测的公开数据集来评估我们的方法，重点是位姿估计和重构的精确性。由于CNN预测的深度取决于训练过程，因此我们展示的实验中，训练集是从与评估基准中完全不同的环境和RGB传感器中选取的，以证实我们的方法（特别是实用性）可用于新环境中。我们也展示了在真实环境中联合场景重建和语义标签融合的测试结果。

2 相关工作

在本节中，我们回顾了我们在框架内集成的两个领域（即SLAM和深度预测）的相关工作。

SLAM

关于SLAM有大量文献。 从要处理的输入数据类型的角度来看，方法可以分为

基于深度相机的[21、30、11]和
基于单目相机的[22-DTAM-rgbd稠密重建，4-LSD-SLAM，20-ORB-SLAM]。
相反，从方法论的角度来看，它们被分类为

基于特征的特征点法[12-PTAM、13、20]，
和直接法[22、5-Semi-dense visual odometry for a monocular camera、4]。在本文中，
我们将集中讨论单目SLAM方法。

对于基于特征点法的单目SLAM，ORB-SLAM [20]是高精度位姿估计方面的最新成果。该方法从输入图像中提取稀疏ORB特征，以进行场景的稀疏重构以及估计相机姿态，还采用了局部BA优化和图优化。
对于直接单目SLAM，[22]的稠密追踪和建图（DTAM）通过使用短基线多视图立体匹配和正则化方法在GPU上实时进行稠密重构，因此在在图像中的低纹理区域的深度估计更加平滑。
此外，大规模直接SLAM（LSD-SLAM）算法[4]提出了使用半稠密地图表示的方法，该表示仅在输入图像的梯度区域上追踪深度值，这就有足够的效率在CPU上来实现实时直接SLAM。
LSD-SLAM的扩展是recent Multi-level mapping（MLM）算法[7]，该算法提出了在LSD-SLAM的基础上使用稠密方法以增加其密度并提高重建精度。

单视图深度预测

深度学习取得了越来越多的进展，从单一视图进行深度预测已在计算机视觉领域引起了越来越多的关注。

经典的深度预测方法采用手工选取的特征和概率图形模型[10，18]来生成规则化的深度图，通常对场景的几何形状有很强的假设性。
在深度估计精度方面，最近开发的深度卷积结构明显优于以前的方法[16、2、3、29、19、17]。有趣的是，[16]的工作（Deeper depth prediction with fully convolutional residual networks - S2D.16）展示了将深度预测作为一个模块用于稠密SLAM。特别的，预测的深度图被Keller用作基于点的融合RGB-D SLAM算法的输入[11- S2D.46]，这表明虽然由于CNN的压缩，使投影模糊与空间信息丢失，从而使导致形状细节缺失，但基于SLAM的场景重建可以通过深度预测获得。
3 提出的单目语义SLAM

在这一部分中，我们将阐述所提出的三维重建框架，其中CNN预测的稠密深度图与从直接单目SLAM获得的深度测量数据相融合。此外，我们还展示了CNN预测的语义分割如何与全局重建模型相融合。图2中的流程图描述了框架概况。我们采用基于关键帧的SLAM[12，4，20]，特别是我们使用[4] S2D.24_LSD-SLAM 中提出的直接半稠密方法作为基准。在这种方法中，视觉清晰帧的子集被收集为关键帧，其位姿基于图优化进行全局优化。同时，通过估计帧与其最近的关键帧之间的变换，在每个输入帧处进行相机姿态估计。

为了保持较高的帧率，我们仅在关键帧上通过CNN预测深度图。特别是如果当前估计的位姿与现有关键帧的位姿相差很远，则从当前帧创建新的关键帧，并通过CNN估计其深度。此外，通过测量每个深度预测的像素置信度来构造不确定度图。由于在大多数情况下，用于SLAM的摄像机不同于用于获取CNN训练数据集的摄像机，因此我们提出了一种特定的深度图归一化过程，以形成对不同摄像机内参的鲁棒性。另外，在进行语义标签融合时，我们使用第二个卷积网络来预测输入帧的语义分割。最后，在关键帧上创建一个位姿图，以便全局优化它们的相对位姿。

该框架的一个特别重要的阶段，也代表了我们的文章的一个主要贡献，是

通过短基线立体匹配，通过优化关键帧和相关输入帧之间的灰度一致性最小化，来优化与每个关键帧相关联的CNN预测深度图。
特别是，深度值将主要围绕具有梯度的图像区域进行优化，即在极线匹配可以改善精度的区域。这将在第3.3和3.4小节中具体描述。相关地，深度值的优化是由与每个深度值相关联的不确定性所决定的，该不确定性是根据一个特殊的置信度估计的（见下文3.3）。框架的每个阶段都会在下面的小节中详细介绍。

3.1 相机位姿估计

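相机位姿估计方法是受[4]中关键帧方法启发的。系统有一组关键帧 $k_1, \dots, k_n \in \mathcal{K}$ 作为进行SLAM重建的结构元素。每个关键帧 $k_i$ 与关键帧位姿 $T_{k_i}$、深度图 $D_{k_i}$ 和深度不确定度图 $U_{k_i}$ 相关联。与[4]相比，我们的深度图是稠密的，因为它是通过基于CNN的深度预测生成的（见第3.2节）。不确定度图表示每个深度值的置信度。与[4]将不确定度初始化为一个很大的恒定值不同，我们的方法根据深度预测的测量置信度来初始化不确定度（见3.3节）。在下面，我们将把一个通用的深度图元素记为 $\mathbf{u} = (x, y)$，它在图像域中取值，即 $\mathbf{u} \in \Omega \subset \mathbb{R}^2$（$\mathbb{R}^2$ 为二维欧氏空间），$\dot{\mathbf{u}}$ 是它的齐次表示。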

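在每一帧 $t$ 上，我们的目标是估计当前相机的位姿，即最近的关键帧 $k_i$ 和帧 $t$ 之间的变换 $T_t^{k_i} = [R_t, \mathbf{t}_t]$，由3×3的旋转矩阵 $R_t \in SO(3)$ 和3D平移向量 $\mathbf{t}_t \in \mathbb{R}^3$ 组成。基于目标函数的加权高斯-牛顿优化方法，通过最小化当前帧的强度图像 $I_t$ 和最近关键帧 $k_i$ 的强度图像 $I_{k_i}$ 之间的光度残差来估计该变换。目标函数如下：

$$E(T_t^{k_i}) = \sum_{\tilde{\mathbf{u}} \in \Omega} \rho\left(\frac{r(\tilde{\mathbf{u}}, T_t^{k_i})}{\sigma\left(r(\tilde{\mathbf{u}}, T_t^{k_i})\right)}\right) \tag{1}$$

其中，$\rho$ 是Huber范数：$\rho(x) = \tfrac{1}{2}x^2$（当 $|x| \le \delta$ 时），否则 $\rho(x) = \delta\left(|x| - \tfrac{1}{2}\delta\right)$；
$\sigma$ 是测量残差不确定度的函数[4]。这里，$r$ 是光度残差，定义为：

$$r(\tilde{\mathbf{u}}, T_t^{k_i}) = I_{k_i}(\tilde{\mathbf{u}}) - I_t\left(\pi\left(K\, T_t^{k_i}\, \tilde{V}_{k_i}(\tilde{\mathbf{u}})\right)\right) \tag{2}$$

考虑到我们的深度图是稠密的，为了提高效率，我们将光度残差的计算限制在高灰度梯度区域内的像素子集上，该像素子集由图像域子集 $\tilde{\mathbf{u}} \subset \mathbf{u} \in \Omega$ 定义。另外，在（2）中，$\pi$ 表示将3D点映射到2D图像坐标的透视投影函数（归一化相机坐标）。
而 $V_{k_i}(\mathbf{u})$ 表示从关键帧的深度图计算得到的三维顶点（图像点换算到相机坐标系下的空间点的三维坐标）：

$$V_{k_i}(\mathbf{u}) = K^{-1}\, \dot{\mathbf{u}}\, D_{k_i}(\mathbf{u}) \tag{3}$$

其中 $K$ 是相机内参矩阵。
获得 $T_t^{k_i}$ 后，世界坐标系中的当前相机位姿计算为 $T_t = T_t^{k_i}\, T_{k_i}$。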
3.2 基于CNN的深度预测与语义分割

每次创建新的关键帧时，都会通过CNN预测相关的深度图。我们采用的深度预测结构是基于[16]中提出的最新方法，它将残差网络（ResNet）[9]扩展为全卷积网络。具体来说，

架构的第一部分基于ResNet-50[9]，并使用图像网络上的预先训练好的权重初始化[24]。
该体系结构的第二部分用一系列由非池化层和卷积层组合而成的残差上采样块，取代了ResNet-50中提出的最后一个池化层和全连接层。
在上采样之后，是drop-out层，最终卷积层输出表示预测深度图的单通道输出图。
损失函数基于反向Huber函数[16]。

遵循其他方法的成功典范，在深度预测和语义分割任务中采用相同的架构[3，29]，我们还重新训练了该网络，以从RGB图像中预测像素级语义标签。为了解决这一问题，我们对网络进行了改进，使其具有与类别数量相同的输出通道，并采用softmax层和交叉熵损失函数，通过反向传播和随机梯度下降（SGD）来最小化。必须指出的是，虽然原则上可以使用任何语义分割算法，但这项工作的主要目标是展示如何在我们的单目SLAM框架内成功融合逐帧级分割地图（见3.5）。

3.3 关键帧创建和位姿图优化

使用预训练的CNN进行深度预测的一个局限是，如果用于SLAM的传感器和用于获取训练集的传感器内参不同，则3D重建的最终绝对尺度将不准确。 为了改善这个问题，我们使用当前摄像机的焦距和用于训练的传感器焦距之比来调整CNN回归的深度。
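$$D_{k_i}(u) = \frac{f_{cur}}{f_{tr}}\,\tilde{D}_{k_i}(u) \tag{5}$$

其中 $f_{cur}$ 是当前摄像机的焦距，$f_{tr}$ 是用于训练的传感器的焦距，$\tilde{D}_{k_i}$ 是由CNN直接从当前关键帧图像 $I_i$ 回归的深度图。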

图3展示了调整过程（5）在ICL-NUIM数据集[8]上的效果（比较（a）和（b））。如图所示，与直接使用CNN预测的深度图相比，调整后的性能有了显著的提高：位姿追踪精度和深度精度都得到了改善。
图3 （a）直接CNN深度预测 （b）深度调整后和 （c）深度调整和细化后（A）位姿轨迹精度 （B）深度估计精度蓝色像素表示正确估计的深度，即误差在真实数据的10%以内。这是在ICL-NUIM数据集上进行比较的[8]。

此外，我们将每个深度图Dki与不确定性图Uki相关联。在[4]中，通过将每个元素设置为一个大的常量值来初始化这个图。由于CNN在每一帧上都是稠密图，但不依赖于任何时空正则化，因此我们提出通过基于当前深度图与在最近关键帧上的场景点之间的差异，来计算置信值，从而初始化我们的不确定性图。因此，这种置信度表示了每个预测深度值在不同帧之间的差异性：对于与高置信度相关联的那些元素，连续的求精过程将比[4]中的过程快并且有效得多。

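具体地说，不确定性图 $U_{k_i}$ 定义为当前关键帧ki的深度图与最近关键帧kj的深度图（按ki到kj的估计变换 $T_{k_j}^{k_i}$ 进行投影对齐后）之间的逐元素平方差：

$$U_{k_i}(u) = \Big(D_{k_i}(u) - D_{k_j}\big(\pi(K\,T_{k_j}^{k_i}\,V_{k_i}(u))\big)\Big)^2 \tag{6}$$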
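为了进一步提高每个新的初始化关键帧的精度，我们在使用新的输入帧（深度细化过程在第3.4小节中描述）对其深度图和不确定性图进行细化后，将其与从最近的关键帧（这显然不适用于第一个关键帧）传播的深度图和不确定性图进行融合。为了达到这个目的，我们首先定义了一个从最近的关键帧kj传播的不确定性图为：

$$\tilde{U}_{k_j}(v) = \frac{D_{k_j}(v)}{D_{k_i}(u)}\,U_{k_j}(v) + \sigma_p^2 \tag{7}$$

其中 $v = \pi\big(K\,T_{k_j}^{k_i}\,V_{k_i}(u)\big)$；根据[4]，$\sigma_p^2$ 是用于增加传播不确定度的白噪声方差（白噪声（white noise）是指功率谱密度在整个频域内是常数的噪声。 所有频率具有相同能量密度的随机噪声称为白噪声）。然后，将两个深度图和不确定性图按照加权方案融合在一起：

$$D_{k_i}(u) = \frac{\tilde{U}_{k_j}(v)\,D_{k_i}(u) + U_{k_i}(u)\,D_{k_j}(v)}{U_{k_i}(u) + \tilde{U}_{k_j}(v)} \tag{8}$$

$$U_{k_i}(u) = \frac{\tilde{U}_{k_j}(v)\,U_{k_i}(u)}{U_{k_i}(u) + \tilde{U}_{k_j}(v)} \tag{9}$$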

最后，位姿图也在每个新的关键帧处更新，方法是使用图中已经存在的关键帧创建新边，这些关键帧与新添加的关键帧有着相似的区域（即具有较小的相对位姿）。此外，关键帧的位姿每次都通过位姿图优化进行全局优化[14]。

3.4 逐帧深度细化

此阶段的目的是基于在每个新帧处估计的深度图，连续地细化当前关键帧的深度图。为了实现这一目标，我们使用了文献[5]（Semi-dense visual odometry for a monocular camera）中描述的短基线立体匹配策略：基于沿极线的5像素匹配，在当前帧t的每个像素处计算深度图Dt和不确定性图Ut，并根据估计的相机位姿将这两个图与关键帧ki对齐。

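然后，将估计的深度图和不确定度图，直接与最近的关键帧ki的深度图和不确定度图融合，如下所示：

$$D_{k_i}(u) = \frac{U_t(u)\,D_{k_i}(u) + U_{k_i}(u)\,D_t(u)}{U_{k_i}(u) + U_t(u)} \tag{10}$$

$$U_{k_i}(u) = \frac{U_t(u)\,U_{k_i}(u)}{U_{k_i}(u) + U_t(u)} \tag{11}$$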
重要的是，由于提出了基于CNN的预测，关键帧与稠密深度图相关联，因此可以稠密地执行此过程，即关键帧的每个元素都被细化，而不是像[5]一样仅沿高梯度区域细化深度值。由于低纹理区域内的观测深度往往具有很高的不确定性（即Ut值较高），因此，所提出的方法自然会产生一个精细的深度图，其中靠近高强度梯度的元素将通过在每个帧处估计的深度进行精细化，而大量的低纹理区域内的元素将逐渐保持CNN的预测深度值，而不受不确定深度观测的影响。

总结：如何优化当前关键帧ki的深度？（关键帧kj -> 关键帧ki -> 新的帧t）

第0步，计算新的帧的Dt Ut 基于[5](短基线立体匹配)
第1步，使用上述Dt Ut 更新当前关键帧ki的深度图和不确定性图（公式10,11）
第2步，使用ki最近的关键帧kj传播过来的不确定图，更新ki的深度图和不确定性图（公式8，9）
图3展示了在ICL-NUIM数据集上进行深度图细化过程的高效[8]。图（c）表示，对深度图进行调整和深度细化后获得的性能，在深度估计和位姿追踪方面，相对于以前的情况有了显著改进。

3.5 全局模型与语义标签融合

上述过程得到的关键帧可以融合在一起，生成重建场景的三维全局模型。由于CNN被训练为除了深度图之外，还可以提供语义标签，通过一个我们称为语义标签融合的过程，语义信息还可以与3D全局模型的每个元素相关联。
在我们的框架中，采用了文献[27]（Real-time and scalable incremental segmentation on dense slam）中提出的实时方案，其目的是将从RGB-D序列的每个帧获得的深度图，和连通分量图逐步融合在一起。该方法使用Global Segmentation Model（GSM）来平均每个3D元素随时间的标签分配，从而对逐帧分割中的噪声具有鲁棒性。在我们的例子中，位姿估计是作为算法的输入提供的，因为相机位姿是通过单目SLAM估计的，而输入深度图是那些只与关键帧相关联的。这里，我们使用语义分割图，而不是[27]中的连通分量图。场景的三维重建是在新的关键帧上逐步构建，其中每个三维元素都与用于训练CNN的集合中的语义类相关联。

4 评估

我们做了一个实验评估，以验证我们的方法在跟踪和重建精度方面的效果，方法是通过与两个公共数据集进行定量比较（4.1）。以及对纯旋转摄像机运动鲁棒性的定性评估（4.2）和语义标签融合的准确性（4.3）。

实验使用台式PC，该PC配置为：Intel Xeon CPU 2.4GHz，RAM16GB，Nvidia Quadro K5200 GPU 8GB。至于我们方法的实现，尽管CNN网络的输入/输出分辨率为304×228[16]，但是输入帧和预测深度图都转换为320×240作为所有其他阶段的输入。此外，CNN深度预测和语义分割在GPU上运行，而所有其他阶段都在CPU上实现，并在两个不同的CPU线程上运行：一个线程专门用于逐帧处理阶段（相机位姿估计和深度细化），另一个线程执行与关键帧相关的处理阶段（关键帧初始化，位姿图优化和全局地图与语义标签融合），使我们的整个框架能够实时运行。

我们使用两个公共数据集，即ICL-NUIM数据集[8]和TUM RGB-D SLAM数据集[26]，前者是合成的，后者是用Kinect传感器获取的。这两个数据集都提供了相机轨迹和深度图方面的真实数据。在我们的所有实验中，我们使用了NYU Depth v2数据集的室内序列上训练的CNN模型[25]，来测试网络对未知环境的泛化能力；该数据集既包括深度真实数据（由使用Microsoft Kinect相机获取的深度图表示），也包括语义标签融合所必需的像素级语义标签标注。特别的，正如[16]所述，我们在官方标注的子集上训练语义分割网络，而深度网络则使用NYU Depth v2数据集的更多帧进行训练。语义标注由4个超类组成：地板、垂直结构、大结构/家具、小结构。值得注意的是，训练数据集的设置与我们评估方法的设置有很大不同，因为它们包含不同的摄像机传感器、视点和场景布局。例如，NYU Depth v2包括许多客厅、厨房和卧室，这些都是TUM RGB-D SLAM中缺少的，TUM RGB-D SLAM主要是在有桌子、物体和人的办公室。

4.1 与其他SLAM方法的比较

我们将我们的方法与LSD-SLAM[4]和ORB-SLAM[20]这两种有公开实现的方法进行比较，这两种最新的方法分别代表了直接法和特征点法。为了使评估更加完整，我们还与REMODE[23]进行了比较，REMODE[23]是一种专注于稠密的单目深度图估计的最先进的方法。REMODE的实现取自作者的代码。最后，我们还将我们的方法与文献[16]中的方法进行了比较，[16]中的方法基于文献[27]作者提供的可行实现，使用CNN预测的深度图作为最新基于深度的SLAM方法的输入（融合了[11，27]）。考虑到单目SLAM方法估计绝对尺度的模糊性，如[4，20]中的评估所示，我们还评估了使用真值深度图对初始尺度进行引导初始化（bootstrapping）的LSD-SLAM。对于REMODE，由于需要在每一帧进行相机位姿估计作为输入，所以我们使用经过引导初始化（bootstrapping）的LSD-SLAM所估计的轨迹和关键帧。

表1 ICL-NUIM和TUM数据集上绝对轨迹误差[m]和正确估计深度百分比的比较（TUM/seq1: fr3/long_office_household，TUM/seq2: fr3/nostructure_texture_near_withloop，TUM/seq3: fr3/structure_texture_far）。

图4 在ICL-NUIM数据集的（office2）序列上比较深度图精度和密度[8]，从左起依次为真实数据、我们的方法得到的优化关键帧、来自CNN的相应原始深度预测、来自LSD-SLAM的优化关键帧[4]与bootstrapping和REMODE的估计密集深度图[23]。精度值表示在此关键帧上正确估计的深度密度。

按照[26]中提出的评估方法，表1给出了基于绝对轨迹误差（ATE）的相机位姿精度，该绝对轨迹误差是根据每个评估序列的估计相机平移和真值相机平移之间的均方根误差计算的。此外，我们还评估了重建精度和密度，通过评估深度值的百分比，其与相应的真实深度的差异小于10%。根据表中的观测值，因为单目SLAM的绝对尺度模糊性，我们的方法总是能够得到比单目方法更高的位姿轨迹精度。有趣的是，即使在对LSD-SLAM应用引导初始化（bootstrapping）后，我们的方法的位姿精度平均也高于LSD-SLAM，这意味着所提出的深度融合方法具有内在的有效性，而不仅仅是估计出了正确的尺度。在重建方面也有同样的优势，因为估计的关键帧不仅比LSD-SLAM和ORB-SLAM的更精确，而且密度更高。此外，与文献[16]中–CNN预测深度在不细化的情况下用作SLAM的输入–的方法相比，我们的方法在位姿和重建精度方面也有更好的表现，这再次证明了所提出的优化模糊边缘和CNN预测错误估计深度方案的有效性。最后，在深度图精度方面，我们明显优于REMODE。

图4还表明，与CNN（如[16]所采用的）和REMODE估计的深度图相比，我们的方法精度更高，同时密度也高于LSD-SLAM。图中比较了真实情况、使用我们的方法的改进的关键帧、CNN的相应原始深度预测、使用bootstrapping的LSD-SLAM[4]的改进的关键帧，以及ICL-NUIM数据集序列上REMODE的估计密集深度图。与LSD-SLAM相比，我们的方法不仅有更高的密度，而且有助于大幅减少CNN预测的模糊伪影，提高整体深度精度。此外，我们可以注意到REMODE往往会在低纹理区域失败，而我们的方法相反，我们的方法可以通过利用CNN预测的深度值来估计这些区域上的稠密深度。

4.2 纯旋转运动下的精度

前面说过，与标准单目SLAM相比，我们方法的一个优点是，在纯旋转运动下，重建仍然可以依靠CNN预测的深度进行，而其他方法由于连续帧之间没有立体基线而失败。为了描述这一优点，我们对来自TUM数据集的（fr1/rpy）序列（主要由旋转摄像机运动组成）进行了评估。如图5所示，分别为我们的方法、LSD-SLAM获得的重建结果两者和真实情况的比较。可以看出，我们的方法可以重建场景结构，即使摄像机的运动是纯旋转的，而LSD-SLAM的结果是明显的噪声，因为估计深度所需的立体基线对于大多数帧是不够的。我们也尝试了ORB-SLAM，但失败了，因为缺乏必要的基线来初始化算法。

4.3 加入三维语义重建

最后，给出了该方法结合三维和语义重建的一些结果。图6中给出了三个示例，其中展示了使用我们自己的序列和从NYU Depth V2数据集的两个序列[25]获得的办公室场景重建。ICL-NUIM数据集的序列living0的另一个示例如图1 c所示。图中还以绿色显示了估计的相机轨迹。据我们所知，这是第一次用单目相机进行联合三维和语义重建的实验。给出的结果包括位姿和重建质量以及语义标签融合三个方面。

5 结论

我们已经展示了如何通过深度神经网络将SLAM与深度预测相结合，这是解决传统单目重建固有局限性的一个有前途的方向，特别是在估计绝对尺度、在低纹理区域获得稠密深度以及处理纯旋转运动方面。提出的基于短基线立体匹配的CNN预测深度图优化方法，在保持单目SLAM在摄像机平移和高图像梯度情况下的鲁棒性和准确性的同时，自然地克服了这些问题。整体框架能够在融合语义分割标签和全局3D模型的同时重建场景，为单目相机的场景理解开辟了新的视角。一个未来的研究方向是用深度预测来闭合回路，即通过几何上精细化的深度图来改进深度估计。

参考

论文翻译（上）：CNN-SLAM_ Real-Time Dense Monocular SLAM With Learned Depth Prediction

展开全文
• 砌体表面裂缝检测 生成此GitHub存储库是为了... D.戴斯（D.Dais），İ.E。 Bal，E。Smyrou，V。Sarhosis，使用卷积神经网络和传递学习在砖石表面上自动进行裂缝分类和分段，《建筑自动化》。 125（2021），第103606页。...
• CNN_classification_feature_extraction 该存储库是pytorch中用于分类和特征提取的CNN的实现。...'resnext50_32x4d', 'resnext101_32x8d', 'wide_resnet50_2', 'wide_resnet101_2', 'vgg11', 'vgg11_bn', 'vgg13'
• ## CNN_LSTM

千次阅读 2017-07-09 19:00:08
使用LSTM代替CNN中的全连接层 import tensorflow as tf import pandas as pd import numpy as np from sklearn.metrics import confusion_matrix from tensorflow.python.ops import rnn,rnn_cell import ...
• 1 cargo run --example cnn_mnist </code></pre> <p>which results in the following error. I've compiled it both with and without the <code>--features mkl</code> flag, and with and without the <code>...
• /mnt/pankajd/sequence-to-sequence/s2s_out_cnn_cnn_53_50M_sampled/params.00001" [INFO:sockeye.utils] GPU 2: 3370/11439 MB (29.46%) GPU 3: 3370/11439 MB (29.46%) GPU 4: 3370/11439 MB (29.46%) GPU 5:...
• #用CNN进行图片分类D:\ST\Python_work\program\AI-CNN-Tensorflow-master #D:\ST\Python_work\program\AI-CNN-Tensorflow-master# 14*256张汉字图片，186行代码，一个代码，文件，bmp图片格式输入 ''' 只有一个文件...
• 在看到CVPR2019 论文 Libra R-CNN时发现这篇论文对一些训练过程中的不平衡进行了处理，从而获得了较高的准确率。 如何选择具有代表性的region proposal（这个问题很多论文都探讨过，可以认为是样本（proposal）的不...
• <div><p>Phần cần dịch: https://github.com/aivivn/d2l-vn/blame/master/chapter_natural-language-processing-applications/sentiment-analysis-cnn_vn.md#L185-L254</p> <h3>Hướng dẫn <ul><li>...
• D.孟格，许Z.许和J.佩斯利， 此代码的另一个版本在[]中 要求 张量流0.12.1 如何使用代码 步骤1：运行python代码python cnn_train.py 第2步：运行Matlab代码Demo_Post_MRF.m 引用 该代码受MIT许可证保护。 如果您在...
• <div><h2>test16:~/bonnet/train_py\$ ./cnn_train.py -d cfg/cityscapes/data.yaml -n cfg/cityscapes/net_bonnet.yaml -t cfg/cityscapes/train_bonnet.yaml -l ../log/ -p ../pretrained</h2> <p>INTERFACE: data ...
• #ifndef _D_CNN_ #define _D_CNN_ #include&lt;Eigen&gt; #include&lt;opencv.hpp&gt; #include&lt;vector&gt; #include&lt;unordered_map&gt; #include&lt;ctime&gt; ...
• https://gist.github.com/zealseeker/c05b88527cc1ceda7525c2889e8ab47f</p> <p>The "cnn_graph" model is stable, the performance is reproducible (about 0.69 R2) while the "tensorgraph" ...
• 摘要 论文梳理 整体框架的组成为三个部分： A real-time SLAM system ElasticFusion A Convolutional Neural Network A Bayesian update scheme 其中过程如下： CNN接收2D图像并且返回每个像素的分类概率分布，然后...
• bilstm_cnn_crf.py:59: UserWarning: Update your <code>Conv1D</code> call to the Keras 2 API: <code>Conv1D(padding="valid", kernel_size=5, filters=50) conv=Conv1D(nb_filter=...
• # -*- coding: utf-8 -*-"""Created on Sat May 26 16:40:17 2018...import tensorflow as tffrom tensorflow.examples.tutorials.mnist import input_datamnist=input_data.read_data_sets('MNIST_d...
• <code>D:\Tensorflow\cnn_lstm_ctc_ocr-master\src>python validate.py d:/Tensorflow/cnn_lstm_ctc_ocr_master/src/11.jpg d:\Anaconda3\lib\site-packages\h5py\__init__.py:34: FutureWarning: Conversion of ...
• and the code <a href="https://github.com/tensorflow/benchmarks/blob/82dd0539c76afa8491e50d8f796e686b4d97b988/scripts/tf_cnn_benchmarks/preprocessing.py#L23">here</a></p>该提问来源于开源项目：...
• 适用于SUN RGB-D的更快的R-CNN 适用于SUN RGB-D的更快的R-CNN。 此实现基于ChainerCV。
• AoT_TCAM CVPR 18论文中用于时间预测箭头的火炬模型 D. Wei，J。Lim，A。Zisserman，W。Freeman。 “学习和使用时间之箭。” 在CVPR 2018中。... 在UCF101上从零开始...D_TXT=data/@01_cnn_ta_flow_orig_fb.txt E_SAV
• 轴承matlab代码滚动轴承故障检测，不使用CNN 概述 这段代码是bla bla bla bla 依存关系 keras tidyverse 剧本 data_import.R ：用于从数据集中读取matlab文件并将其转换为整齐CSV的文件 数据集 该数据集来自Case ...
• python main.py --gan_type wgan --net_type cnn --clip 0.05 --D_iter 5 --epoch 10 执行run.sh训练所有模型并生成图像。 ./run.sh 清洁一个模型的无花果，检查点和日志。 python main.py --todo clear --gan_...

...