TBD
RNN and Attention models and their mathematical formulas
References
- A simple Seq2Seq implementation for writing couplets: a detailed tutorial on couplet generation
feature types in dataset (see the Tensorflow高阶读写教程)
tf.data parsing supports the following feature types:
context_features = {
    "age": tf.FixedLenFeature([], dtype=tf.int64),  # single-value feature of fixed length 1 (dim=0)
    "movie_rating": tf.FixedLenFeature(shape=[3], dtype=tf.int64)}  # feature of fixed shape [3], e.g. [1, 2, 1]
sequence_features = {
    "item_id": tf.FixedLenSequenceFeature([], dtype=tf.int64, allow_missing=True),
    # sequence feature of shape [None]; allow_missing is required, e.g. [1, 2, 2, ...]
    "movie_rating": tf.FixedLenSequenceFeature([3], dtype=tf.int64, allow_missing=True),
    # sequence feature of shape [None, 3] (variable length on axis 0, fixed length 3 on axis 1); allow_missing is required
    # e.g. [[1, 2, 1], [2, 1, 1], [1, 1, 2], ...]
    "click_ids": tf.VarLenFeature(dtype=tf.int64)}  # sequence feature that is variable-length on every dimension
    # e.g. [[1], [1, 2], [1, 2, 3]]
parse_example() comes in the following variants:
- tf.parse_example()
- tf.parse_single_example(): returns one dict, e.g. {"age": 18, "gender": 1}
- tf.parse_single_sequence_example(): returns two dicts; the first is the context dict (single-value features), in the same format as above, and the second is the feature_list dict (sequence features), e.g. {"rates": [1, 3, 2], ...}
See the Tensorflow高阶读写教程 for details.
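For instance, a minimal sketch of tf.parse_example() on a batch of serialized tf.train.Example protos, reusing the context_features spec above (the serialized_batch placeholder is introduced here only for illustration):
serialized_batch = tf.placeholder(tf.string, shape=[None])  # a batch of serialized Example protos
parsed = tf.parse_example(serialized_batch, features=context_features)
# parsed is a dict of dense tensors: parsed["age"] has shape [batch_size],
# parsed["movie_rating"] has shape [batch_size, 3]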
fixed_size_partitioner() In TF's ps (parameter server) architecture, the ps nodes store the model parameters and the workers use the training data to update them. By default, TF places variables on the ps nodes in round-robin fashion. Using tf.fixed_size_partitioner() together with tf.variable_scope(), a variable can be split along a given dimension (axis) into a given number of shards (num_shards); see Tensorflow参数分割 for details.
partitioner = tf.fixed_size_partitioner(num_of_ps, 0)
with tf.variable_scope("conv1", partitioner=partitioner):
    W_conv1 = weight_variable([5, 5, 1, 32], "conv1")
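As a hedged aside (the variable name and shape below are illustrative), the partitioner only takes effect on variables created through tf.get_variable, and the result is a PartitionedVariable split into num_shards pieces along the chosen axis:
partitioner = tf.fixed_size_partitioner(num_shards=4, axis=0)
with tf.variable_scope("embedding", partitioner=partitioner):
    emb = tf.get_variable("emb", shape=[100000, 64], dtype=tf.float32)
# emb is a PartitionedVariable made of 4 shards, each of shape [25000, 64]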
padded_batch() aligns samples of different lengths within a batch; see TFRecordDataset变长数据的batch读取 for more.
feature_map = {'sequence': tf.VarLenFeature(dtype=tf.float32),
               'sequence_len': tf.FixedLenFeature(shape=[], dtype=tf.int64),  # parsing only supports int64/float32/string, not int32
               'label': tf.FixedLenFeature(shape=[], dtype=tf.int64)}
dataset = dataset.padded_batch(batch_size=10,
                               padded_shapes={'sequence': [None], 'sequence_len': [], 'label': []})
# [] is the shape of a single-value feature, [None] the shape of a variable-length sequence feature;
# padded_shapes must mirror the element structure: a dict for dict elements, a tuple/list for tuple elements
dataset = dataset.padded_batch(batch_size=10,
                               padded_shapes=([None], [], []),
                               padding_values=None)
# padded_shapes can also be written as a plain tuple/list like this (when the elements are tuples), but it is less readable
The padded_shapes argument specifies the shape that each component of a record should be padded to:
- if a component is a scalar (e.g. a click label such as 1), you must use [] (i.e. nothing may be written inside the brackets), which is equivalent to no padding;
- if a component is a list (e.g. a click sequence [item1, item2, item3]), [max_length] pads every row to length max_length, while [None] pads every row to the length of the longest sample in the batch, i.e. the brackets must contain something;
- if a component is an array, use [d1,...,dn]; here is a multi-dimensional example, and a small sketch follows below.
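That multi-dimensional case in a hedged sketch (the from_generator dataset below is an illustrative assumption): a padded shape of [None, 3] pads axis 0 up to the longest sample in the batch while axis 1 stays fixed at 3.
ds = tf.data.Dataset.from_generator(
    lambda: ([[1., 2., 3.]] * n for n in [1, 2, 4]),  # elements of shape [n, 3] with varying n
    output_types=tf.float32,
    output_shapes=tf.TensorShape([None, 3]))
ds = ds.padded_batch(batch_size=3, padded_shapes=[None, 3])  # each element is zero-padded on axis 0 up to 4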
tf.contrib.data.group_by_window() Consider the features above, i.e. the dataset components are, in order, sequence, sequence_len and label. key_func() assigns each row a bucket ID, and rows with the same ID are grouped into the same window; key_func() takes one sample (row) as input and outputs its bucket ID. reduce_func defines an output rule: the rule is only declared here and is executed when the data is actually pulled. It processes at most window_size elements of the same bucket and emits the processed data (the docs call it "another dataset"). (The example below does the padding inside reduce_func, so if you pad the sequence data in group_by_window, do not also pad it beforehand, or it will be padded twice.) For window_size, simply using batch_size is usually fine; see the tensorflow nmt data-preprocessing code for more.
def key_func(unused_key1, sequence_len, unused_key2):  # unused_key1 is sequence, unused_key2 is label
    # max_length and num_buckets are hyperparameters defined elsewhere
    if max_length > 1:
        bucket_width = (max_length + num_buckets - 1) // num_buckets
    else:
        bucket_width = 10
    bucket_id = sequence_len // bucket_width
    return tf.to_int64(tf.minimum(num_buckets, bucket_id))
def reduce_func(unused_key, windowed_data):  # unused_key is the bucket ID shared by the data in this window
    return windowed_data.padded_batch(
        batch_size=10,
        padded_shapes=([None], [], []))  # (sequence, sequence_len, label)
dataset = dataset.apply(tf.contrib.data.group_by_window(key_func=key_func,
                                                        reduce_func=reduce_func, window_size=batch_size))
tf.nn.in_top_k() For multi-class classification, checks for each sample whether its target class is among the top k predictions.
predictions = [[1, 2, 5], [2, 4, 1], [2, 5, 1]]  # must be rank 2
targets = [0, 1, 2]  # must be rank 1
tf.nn.in_top_k(predictions, targets, k=2)
# returns the boolean array [False, True, False]
tf.nn.dynamic_rnn See here, here and here
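As a quick hedged reference (the placeholder shapes and the GRU cell size below are illustrative assumptions), a typical tf.nn.dynamic_rnn call looks like this:
inputs = tf.placeholder(tf.float32, shape=[None, None, 64])  # [batch_size, max_time, input_dim]
seq_len = tf.placeholder(tf.int32, shape=[None])             # true length of each sample
cell = tf.nn.rnn_cell.GRUCell(num_units=128)
outputs, state = tf.nn.dynamic_rnn(cell, inputs,
                                   sequence_length=seq_len,  # steps beyond seq_len produce zero outputs
                                   dtype=tf.float32)
# outputs: [batch_size, max_time, 128]; state: [batch_size, 128]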
tf.metrics.recall_at_top_k()
tf.train.MonitoredSession() See here
tf.app.run() See here
Packing variable-length sequence data into TFRecord
filename = "data/test.tfrecord"
labels = [1, 2, 3, 4, 5]
frames = [[1], [2, 2], [3, 3, 3], [4, 4, 4, 4], [5, 5, 5, 5, 5]]
with tf.python_io.TFRecordWriter(filename) as writer:
    for i in range(len(labels)):
        label, frame = labels[i], frames[i]
        label_feature = tf.train.Feature(  # int64 single-value feature
            int64_list=tf.train.Int64List(value=[label]))
        frame_feature_list = tf.train.FeatureList(  # int64 sequence feature, every element is a single value
            feature=[tf.train.Feature(int64_list=tf.train.Int64List(value=[v])) for v in frame])
        seq_example = tf.train.SequenceExample(
            context=tf.train.Features(feature={"label": label_feature}),
            feature_lists=tf.train.FeatureLists(feature_list={"frame": frame_feature_list}))
        serialized = seq_example.SerializeToString()
        writer.write(serialized)
# with the `with` block there is no need to call writer.close()
dataset = tf.data.TFRecordDataset('data/test.tfrecord')  # with tf.data there is no need for the queue-based tf.TFRecordReader
def _parse(serialized_example):
    context_features = {"label": tf.FixedLenFeature([], dtype=tf.int64)}
    sequence_features = {"frame": tf.FixedLenSequenceFeature([], dtype=tf.int64)}
    context_parsed, sequence_parsed = tf.parse_single_sequence_example(
        serialized=serialized_example,
        context_features=context_features,
        sequence_features=sequence_features)
    # per the parse_single_sequence_example docs, FixedLenFeature and FixedLenSequenceFeature
    # are parsed into dense tensors, while VarLenFeature is parsed into a SparseTensor and needs
    # a sparse-to-dense conversion via tf.sparse_tensor_to_dense()
    return context_parsed["label"], sequence_parsed["frame"]
dataset = dataset.map(_parse)
dataset = dataset.padded_batch(5, padded_shapes=([], [None]))  # padded_shapes matches what _parse returns
# datasets containing variable-length sequences must be batched with padded_batch; dataset.batch() raises an error
iterator = dataset.make_one_shot_iterator()
next_batch = iterator.get_next()  # build the op once, outside the loop
with tf.Session() as sess:
    try:
        while True:
            print(sess.run(next_batch))
    except tf.errors.OutOfRangeError:
        pass  # the 5 records above fit into a single padded batch
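If a sequence feature were declared with tf.VarLenFeature instead (say a hypothetical "click_ids" feature), the parsed value would be a SparseTensor; a hedged sketch of the conversion mentioned in the comment inside _parse:
# inside _parse, assuming sequence_features also contained
# "click_ids": tf.VarLenFeature(dtype=tf.int64)
clicks_sparse = sequence_parsed["click_ids"]             # SparseTensor
clicks_dense = tf.sparse_tensor_to_dense(clicks_sparse)  # dense tensor, missing entries filled with 0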
Convolution and pooling
Multiprocessing
NCE loss and sampled softmax loss
tf.nn.nce_loss
multi-head self attention
def self_multihead_attn(inputs, num_units, num_heads, key_masks, dropout_rate, is_training):
    # inputs: [batch_size, seq_len, channels]; key_masks: boolean [batch_size, seq_len], True for valid positions
    if num_units is None:
        num_units = inputs.get_shape().as_list()[-1]
    length = tf.reshape(tf.shape(inputs)[1], [-1])
    Q_K_V = tf.layers.dense(inputs, 3 * num_units, tf.nn.relu)  # [batch_size, seq_len, 3 * num_units]
    Q, K, V = tf.split(Q_K_V, 3, -1)  # each of Q, K, V is [batch_size, seq_len, num_units]
    Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # [num_heads * batch_size, seq_len, num_units / num_heads]
    K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)
    V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)
    align = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # [num_heads * batch_size, seq_len, seq_len]
    align = align / (K_.get_shape().as_list()[-1] ** 0.5)  # scale
    paddings = tf.fill(tf.shape(align), float('-inf'))  # exp(-large) -> 0
    # Key Masking
    key_masks = tf.tile(key_masks, [num_heads, 1])  # (h*N, T_k)
    key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(inputs)[1], 1])  # (h*N, T_q, T_k)
    align = tf.where(key_masks, align, paddings)  # (h*N, T_q, T_k)
    # Future Binding
    lower_tri = tf.ones(tf.concat([length, length], axis=0))  # (T_q, T_k)
    lower_tri = tf.contrib.linalg.LinearOperatorTriL(lower_tri).to_dense()  # (T_q, T_k)
    masks = tf.tile(tf.expand_dims(lower_tri, 0), [tf.shape(align)[0], 1, 1])  # (h*N, T_q, T_k)
    align = tf.where(tf.equal(masks, 0), paddings, align)  # (h*N, T_q, T_k)
    # Softmax
    align = tf.nn.softmax(align)  # [num_heads * batch_size, seq_len, seq_len]
    tf.summary.histogram("attn_weights_dist", tf.trace(align) / tf.reduce_sum(align, axis=(1, 2)))
    tf.summary.scalar("attn_weights_sum", tf.reduce_sum(tf.trace(align)) / tf.reduce_sum(align))
    align = tf.layers.dropout(align, dropout_rate, training=is_training)  # (h*N, T_q, T_k)
    # Weighted sum
    outputs = tf.matmul(align, V_)  # (h*N, T_q, C/h)
    # Restore shape
    outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # [batch_size, seq_len, num_units]
    # Residual connection
    outputs += inputs  # (N, T_q, C)
    # Normalize
    outputs = layer_norm(outputs)  # (N, T_q, C); layer_norm wraps layer_norm_compute defined below
    return outputs
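A hedged usage sketch (the placeholder shapes and the tf.sequence_mask construction of key_masks are assumptions; the function above only needs a boolean mask of shape [batch_size, seq_len]):
inputs = tf.placeholder(tf.float32, shape=[None, None, 128])  # [batch_size, seq_len, num_units]
seq_len = tf.placeholder(tf.int32, shape=[None])
key_masks = tf.sequence_mask(seq_len, maxlen=tf.shape(inputs)[1])  # boolean [batch_size, seq_len]
attended = self_multihead_attn(inputs, num_units=128, num_heads=8,
                               key_masks=key_masks, dropout_rate=0.1, is_training=True)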
Layer Norm
def layer_norm_compute(x, epsilon, scale, bias):
    """Layer norm raw computation"""
    mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
    var = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True)
    norm_x = (x - mean) * tf.rsqrt(var + epsilon)
    return norm_x * scale + bias
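The attention code above calls layer_norm(); a minimal sketch of such a wrapper (the scope name and epsilon default are assumptions) that creates the scale and bias parameters and delegates to layer_norm_compute:
def layer_norm(x, epsilon=1e-6, scope="layer_norm"):
    num_units = x.get_shape().as_list()[-1]
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        scale = tf.get_variable("scale", shape=[num_units], initializer=tf.ones_initializer())
        bias = tf.get_variable("bias", shape=[num_units], initializer=tf.zeros_initializer())
    return layer_norm_compute(x, epsilon, scale, bias)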
Last updated on Jun 30, 2019.