
RNN with TF

Code

TBD

RNN and Attention models and mathematical formulas

Some commonly used basic TF operations

References


feature types in dataset see Tensorflow高阶读写教程

tf.data supports the following feature types:

context_features = {
    "age": tf.FixedLenFeature([], dtype=tf.int64), #single-value feature with fixed length 1 (dim=0)
    "movie_rating": tf.FixedLenFeature(shape=[3], dtype=tf.int64)} #feature with shape=[3], e.g. [1, 2, 1]
sequence_features = {
    "item_id": tf.FixedLenSequenceFeature([], dtype=tf.int64, allow_missing=True),
    #sequence feature with shape=[None]; allow_missing is required, e.g. [1, 2, 2, ...]
    "movie_rating": tf.FixedLenSequenceFeature([3], dtype=tf.int64, allow_missing=True),
    #sequence feature with shape=[None, 3] (variable length on axis=0, fixed length 3 on axis=1); allow_missing is required
    #e.g. [[1, 2, 1], [2, 1, 1], [1, 1, 2], ...]
    "click_ids": tf.VarLenFeature(dtype=tf.int64)} #sequence feature that is variable-length in every dimension
    #e.g. [[1], [1, 2], [1, 2, 3]]

parse_example() comes in the following variants:

  • tf.parse_example()
  • tf.parse_single_example(): returns a dict, e.g. {"age": 18, "gender": 1}
  • tf.parse_single_sequence_example(): returns two dicts; the first is the context dict (single-value features), same as above, and the second is the feature_list dict (sequence features), e.g. {"rates": [1, 3, 2], ...}

See Tensorflow高阶读写教程 for a detailed introduction.
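
A minimal sketch (TF 1.x) of tf.parse_single_example() inside a tf.data pipeline, reusing the context_features dict defined above; the file path is just a placeholder:

dataset = tf.data.TFRecordDataset("data/test.tfrecord") #placeholder path
dataset = dataset.map(lambda record: tf.parse_single_example(record, features=context_features))
#each element is now a dict such as {"age": <int64 scalar>, "movie_rating": <int64 tensor of shape [3]>}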


fixed_size_partitioner() In TF's PS (parameter server) architecture, the PS tasks store the model parameters while the workers update them with the training data. By default, TF places the parameters on the PS tasks in a round-robin fashion. Using tf.fixed_size_partitioner() together with tf.variable_scope() splits a variable along a given dimension (axis) into a given number of shards (num_shards); see Tensorflow参数分割 for details.

partitioner = tf.fixed_size_partitioner(num_of_ps, 0)
with tf.variable_scope("conv1", partitioner=partitioner):
    W_conv1 = weight_variable([5, 5, 1, 32], "conv1")
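
weight_variable() above is a helper (as in the MNIST tutorial); with plain tf.get_variable() the same idea looks roughly like the sketch below, where the variable name and initializer are illustrative:

partitioner = tf.fixed_size_partitioner(num_shards=num_of_ps, axis=0)
with tf.variable_scope("conv1", partitioner=partitioner):
    #the variable is split into num_of_ps shards along axis 0 and the shards are placed on different ps tasks
    W_conv1 = tf.get_variable("weights", shape=[5, 5, 1, 32],
                              initializer=tf.truncated_normal_initializer(stddev=0.1))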

padded_batch() aligns samples of different lengths within a batch; for more, see TFRecordDataset变长数据的batch读取.

feature_map = {'sequence': tf.VarLenFeature(dtype=tf.float32),
               'sequence_len': tf.FixedLenFeature(shape=[], dtype=tf.int64),
               'label': tf.FixedLenFeature(shape=[], dtype=tf.int64)}
#example parsing only supports tf.float32, tf.int64 and tf.string, hence tf.int64 for the scalar features
dataset = dataset.padded_batch(batch_size=10,
                               padded_shapes={'sequence': [None], 'sequence_len': [], 'label': []})
#[] is the shape of a single-value feature, [None] the shape of a variable-length sequence feature;
#padded_shapes must mirror the structure of the dataset elements (a dict here)
dataset = dataset.padded_batch(batch_size=10,
                               padded_shapes=([None], [], []),
                               padding_values=None)
#if the elements are tuples instead of dicts, padded_shapes is written as a tuple/list in the same order,
#which is less readable

The padded_shapes argument specifies the shape that each component (feature) of a record in the dataset is padded to (a runnable sketch follows this list):

  • if a component of a single record is a scalar (e.g. a click label: 1), you must use [] (i.e. nothing inside the brackets), which is equivalent to no padding;
  • if it is a list (e.g. a click sequence: [item1, item2, item3]), [max_length] pads every row to length max_length, and [None] pads every row to the length of the longest sample in the batch, i.e. the brackets must contain something;
  • if it is an array, use [d1,...,dn]; there is a multi-dimensional example here.
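
A minimal, self-contained sketch (TF 1.x) of padded_batch() on variable-length integer sequences; the toy data and batch size are made up:

import tensorflow as tf

def gen():
    for seq in ([1], [2, 2], [3, 3, 3]):
        yield seq

dataset = tf.data.Dataset.from_generator(gen, output_types=tf.int64, output_shapes=[None])
dataset = dataset.padded_batch(batch_size=3, padded_shapes=[None]) #pad every row to the longest sample in the batch
iterator = dataset.make_one_shot_iterator()
with tf.Session() as sess:
    print(sess.run(iterator.get_next())) #[[1 0 0] [2 2 0] [3 3 3]]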

tf.contrib.data.group_by_window() Consider the features above, i.e. the dataset's features are, in order, sequence, sequence_len and label.

  • key_func() assigns a bucket ID to every row; rows with the same ID are put into the same window. key_func() takes one sample (row) as input and returns the bucket ID;
  • reduce_func() defines an output rule. The rule is only declared here and is executed when data is actually pulled. It takes at most window_size rows of the same bucket, processes them, and outputs the result (the docs call it "another dataset"). (The example below does padding inside group_by_window, so do not pad the sequence data beforehand, otherwise it would be padded twice.)
  • window_size can usually just be set to batch_size; for more, see the data preprocessing in tensorflow nmt.
def key_func(unused_key1, sequence_len, unused_key2): #unused_key1 is sequence, unused_key2 is label
    if max_length > 1: #max_length and num_buckets are predefined hyperparameters
        bucket_width = (max_length + num_buckets - 1) // num_buckets
    else:
        bucket_width = 10
    bucket_id = sequence_len // bucket_width
    return tf.to_int64(tf.minimum(num_buckets, bucket_id))

def reduce_func(unused_key, windowed_data): #unused_key is the bucket ID of the data in this bucket
    return windowed_data.padded_batch(
                         batch_size=10,
                         padded_shapes={'sequence': [None], 'sequence_len': [], 'label': []})

dataset = dataset.apply(tf.contrib.data.group_by_window(key_func=key_func,
                                        reduce_func=reduce_func, window_size=batch_size))

tf.nn.in_top_k() For multi-class problems, checks for each sample whether its target is among the top-k predictions.

predictions = [[1., 2., 5.], [2., 1., 4.], [2., 5., 1.]] #must be rank 2 and float
targets = [0, 2, 2] #must be rank 1
tf.nn.in_top_k(predictions, targets, k=2)
# returns the boolean array [False, True, False]

tf.dynamic_rnn see here, here, and here
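
A minimal sketch (TF 1.x) of tf.nn.dynamic_rnn(); the cell size and input dimension are made-up example values:

inputs = tf.placeholder(tf.float32, [None, None, 8]) #[batch_size, max_time, input_dim]
seq_len = tf.placeholder(tf.int32, [None]) #true (unpadded) length of each sequence
cell = tf.nn.rnn_cell.GRUCell(num_units=16)
outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, sequence_length=seq_len, dtype=tf.float32)
#outputs: [batch_size, max_time, 16], positions beyond seq_len are zeroed out
#final_state: [batch_size, 16], the state at the last valid step of each sequence
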
tf.metrics.recall_at_top_k()

tf.train.MonitoredSession() see here
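
The usual pattern, sketched with tf.train.MonitoredTrainingSession; train_op and the checkpoint directory are assumed to be defined elsewhere:

global_step = tf.train.get_or_create_global_step()
hooks = [tf.train.StopAtStepHook(last_step=1000)] #stop after 1000 global steps
with tf.train.MonitoredTrainingSession(checkpoint_dir="/tmp/ckpt", hooks=hooks) as sess:
    while not sess.should_stop():
        sess.run(train_op) #train_op is assumed to increment global_step; init, checkpoints and summaries are handled automatically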

tf.app.run() see here
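
The standard boilerplate (TF 1.x); the flag is just an example:

import tensorflow as tf

tf.app.flags.DEFINE_integer("batch_size", 32, "batch size") #example flag
FLAGS = tf.app.flags.FLAGS

def main(_): #tf.app.run() parses the flags and then calls main(argv)
    print("batch_size =", FLAGS.batch_size)

if __name__ == "__main__":
    tf.app.run()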


Packing variable-length sequence data with TFRecord

filename = "data/test.tfrecord"
labels = [1, 2, 3, 4, 5]
frames = [[1], [2, 2], [3, 3, 3], [4, 4, 4, 4], [5, 5, 5, 5, 5]]

with tf.python_io.TFRecordWriter(filename) as writer:
    for i in range(len(labels)):
        label, frame = labels[i], frames[i]
        label_feature = tf.train.Feature( #int64 single-value feature
            int64_list=tf.train.Int64List(value=[label]))
        frame_feature_list = tf.train.FeatureList( #int64 sequence feature, every element is a single value
            feature=[tf.train.Feature(int64_list=tf.train.Int64List(value=[v])) for v in frame])
        seq_example = tf.train.SequenceExample(
            context=tf.train.Features(feature={"label": label_feature}),
            feature_lists=tf.train.FeatureLists(feature_list={"frame": frame_feature_list}))
        serialized = seq_example.SerializeToString()
        writer.write(serialized)
    #with the with-block there is no need to call writer.close()
#the queue-based tf.TFRecordReader() is unnecessary when reading with tf.data
dataset = tf.data.TFRecordDataset('data/test.tfrecord')
def _parse(serialized_example):
    context_features = {"label": tf.FixedLenFeature([], dtype=tf.int64)}
    sequence_features = {"frame": tf.FixedLenSequenceFeature([], dtype=tf.int64)}
    context_parsed, sequence_parsed = tf.parse_single_sequence_example(
        serialized=serialized_example,
        context_features=context_features,
        sequence_features=sequence_features)
    #According to the parse_single_sequence_example docs, FixedLenFeature and FixedLenSequenceFeature
    #are both parsed into dense tensors, while VarLenFeature is parsed into a sparse tensor and needs
    #a sparse-to-dense conversion, e.g. tf.sparse_tensor_to_dense()
    return context_parsed["label"], sequence_parsed["frame"]

dataset = dataset.map(_parse)
dataset = dataset.padded_batch(5, padded_shapes=([], [None])) #padded_shapes matches what _parse returns
#data containing variable-length sequences must be batched with padded_batch; dataset.batch() raises an error
iterator = dataset.make_one_shot_iterator()
next_element = iterator.get_next() #build the op once, outside the loop
with tf.Session() as sess:
    try:
        while True:
            print(sess.run(next_element))
    except tf.errors.OutOfRangeError: #raised once the single padded batch of 5 records is exhausted
        pass

Convolution and pooling

Multiprocessing

NCE loss and sampled softmax loss

tf.nn.nce_loss
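
A minimal sketch (TF 1.x) of tf.nn.nce_loss(); vocab_size, embed_dim and num_sampled are made-up example values:

vocab_size, embed_dim, num_sampled = 10000, 64, 32
inputs = tf.placeholder(tf.float32, [None, embed_dim]) #hidden vector of each example
labels = tf.placeholder(tf.int64, [None, 1]) #true class id of each example
nce_w = tf.get_variable("nce_w", [vocab_size, embed_dim])
nce_b = tf.get_variable("nce_b", [vocab_size], initializer=tf.zeros_initializer())
loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_w, biases=nce_b,
                                     labels=labels, inputs=inputs,
                                     num_sampled=num_sampled, num_classes=vocab_size))
#at evaluation time use the full softmax: logits = tf.matmul(inputs, nce_w, transpose_b=True) + nce_b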

multi-head self attention

def self_multihead_attn(inputs, num_units, num_heads, key_masks, dropout_rate, is_training):
    if num_units is None:
        num_units = inputs.get_shape().as_list()[-1]
    length = tf.reshape(tf.shape(inputs)[1], [-1])
    Q_K_V = tf.layers.dense(inputs, 3 * num_units, tf.nn.relu)  # [batch_size, seq_len, 3 * num_units]
    Q, K, V = tf.split(Q_K_V, 3, -1)  # each of Q, K, V is [batch_size, seq_len, num_units]

    Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # [num_heads * batch_size, seq_len, num_units / num_heads]
    K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)
    V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)

    align = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # [num_heads * batch_size, seq_len, seq_len]
    align = align / (K_.get_shape().as_list()[-1] ** 0.5)  # scale

    paddings = tf.fill(tf.shape(align), float('-inf'))  # exp(-large) -> 0

    key_masks = tf.tile(key_masks, [num_heads, 1])  # (h*N, T_k)
    key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(inputs)[1], 1])  # (h*N, T_q, T_k)
    align = tf.where(key_masks, align, paddings)  # (h*N, T_q, T_k)

    # Future Binding
    lower_tri = tf.ones(tf.concat([length, length], axis=0))  # (T_q, T_k)
    lower_tri = tf.contrib.linalg.LinearOperatorTriL(lower_tri).to_dense()  # (T_q, T_k)
    masks = tf.tile(tf.expand_dims(lower_tri, 0), [tf.shape(align)[0], 1, 1])  # (h*N, T_q, T_k)
    align = tf.where(tf.equal(masks, 0), paddings, align)  # (h*N, T_q, T_k)

    # Softmax
    align = tf.nn.softmax(align)  # [num_heads * batch_size, seq_len, seq_len]
    tf.summary.histogram("attn_weights_dist", tf.trace(align) / tf.reduce_sum(align, axis=(1, 2)))
    tf.summary.scalar("attn_weights_sum", tf.reduce_sum(tf.trace(align)) / tf.reduce_sum(align))

    align = tf.layers.dropout(align, dropout_rate, training=is_training)  # (h*N, T_q, T_k)

    # Weighted sum
    outputs = tf.matmul(align, V_)  # (h*N, T_q, C/h)
    # Restore shape
    outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # [batch_size, seq_len, num_units]
    # Residual connection
    outputs += inputs  # (N, T_q, C)
    # Normalize
    outputs = layer_norm(outputs)  # (N, T_q, C)
    return outputs
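
A hypothetical call, assuming inputs of shape [batch_size, seq_len, num_units] (num_units must equal the input dimension because of the residual connection) and a boolean key_masks of shape [batch_size, seq_len] that is True at real tokens and False at padding:

outputs = self_multihead_attn(inputs, num_units=128, num_heads=8,
                              key_masks=key_masks, dropout_rate=0.1, is_training=True)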

Layer Norm

def layer_norm_compute(x, epsilon, scale, bias):
    """Layere norm raw computation"""
    mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
    var = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True)
    norm_x = (x - mean) * tf.rsqrt(var + epsilon)
    return norm_x * scale + bias
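
self_multihead_attn() above calls a layer_norm() helper that is not shown; a minimal sketch of such a wrapper (scope and variable names are hypothetical) that creates the scale and bias variables and delegates to layer_norm_compute():

def layer_norm(x, epsilon=1e-6, scope="layer_norm"):
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        num_units = x.get_shape().as_list()[-1]
        scale = tf.get_variable("scale", [num_units], initializer=tf.ones_initializer())
        bias = tf.get_variable("bias", [num_units], initializer=tf.zeros_initializer())
        return layer_norm_compute(x, epsilon, scale, bias)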

Last updated on Jun 30, 2019.