tensorlayer/tensorlayer/rein.py at tutorial_fix · DeepLearningCode/tensorlayer

167 lines (140 loc) · 5.17 KB
#! /usr/bin/python
# -*- coding: utf-8 -*-
import numpy as np
import tensorflow as tf
from six.moves import xrange
# Public API of this module (names exported by ``from ... import *``).
__all__ = [
    'discount_episode_rewards',
    'cross_entropy_reward_loss',
    'log_weight',
    'choice_action_by_probs',
]


def discount_episode_rewards(rewards=None, gamma=0.99, mode=0):
    """Compute discounted rewards for a 1D sequence of per-step rewards.

    The sequence is walked backwards while accumulating
    ``running_add = running_add * gamma + rewards[t]``.

    Parameters
    ----------
    rewards : list or numpy.ndarray
        1D sequence of rewards, one entry per time step.
    gamma : float
        Discount factor.
    mode : int
        Mode for computing the discounted rewards.
            - If mode == 0, reset the running sum whenever a non-zero reward is
              encountered, i.e. a non-zero reward is treated as the end of an
              episode (e.g. Ping-pong game).
            - For any other value (e.g. mode == 1), never reset the running sum.

    Returns
    --------
    numpy.ndarray
        float32 array, same shape as ``rewards``, holding the discounted rewards.

    Raises
    ------
    ValueError
        If ``rewards`` is None.

    Examples
    ----------
    >>> rewards = np.asarray([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1])
    >>> gamma = 0.9
    >>> discount_rewards = tl.rein.discount_episode_rewards(rewards, gamma)
    >>> print(discount_rewards)  # approx. [0.729 0.81 0.9 1. 0.729 0.81 0.9 1. 0.729 0.81 0.9 1.]
    >>> discount_rewards = tl.rein.discount_episode_rewards(rewards, gamma, mode=1)
    >>> print(discount_rewards)  # approx. [1.5211 1.6901 1.8779 2.0866 1.2073 1.3414 1.4905 1.6561 0.729 0.81 0.9 1.]

    """
    if rewards is None:
        raise ValueError("rewards should be a list")

    # Accept plain Python lists as documented; ``.size`` and integer indexing
    # below need an ndarray.
    rewards = np.asarray(rewards)
    discounted_r = np.zeros_like(rewards, dtype=np.float32)
    running_add = 0
    for t in reversed(range(rewards.size)):
        # In mode 0 a non-zero reward marks an episode boundary: drop whatever
        # was accumulated from later time steps before adding this reward.
        if mode == 0 and rewards[t] != 0:
            running_add = 0
        running_add = running_add * gamma + rewards[t]
        discounted_r[t] = running_add
    return discounted_r


def cross_entropy_reward_loss(logits, actions, rewards, name=None):
    """Calculate the loss for a Policy Gradient Network.

    The loss is the sum over the batch of the sparse softmax cross-entropy
    between ``logits`` and ``actions``, weighted element-wise by ``rewards``.

    Parameters
    ----------
    logits : tensor
        The network outputs without softmax. This function implements softmax inside.
    actions : tensor or placeholder
        The agent actions.
    rewards : tensor or placeholder
        The rewards.
    name : str or None
        Name passed to the cross-entropy op (only used with the
        ``labels=``/``logits=`` keyword signature).

    Returns
    --------
    Tensor
        The TensorFlow loss (result of ``tf.reduce_sum``).

    Examples
    ----------
    >>> states_batch_pl = tf.placeholder(tf.float32, shape=[None, D])
    >>> network = InputLayer(states_batch_pl, name='input')
    >>> network = DenseLayer(network, n_units=H, act=tf.nn.relu, name='relu1')
    >>> network = DenseLayer(network, n_units=3, name='out')
    >>> probs = network.outputs
    >>> sampling_prob = tf.nn.softmax(probs)
    >>> actions_batch_pl = tf.placeholder(tf.int32, shape=[None])
    >>> discount_rewards_batch_pl = tf.placeholder(tf.float32, shape=[None])
    >>> loss = tl.rein.cross_entropy_reward_loss(probs, actions_batch_pl, discount_rewards_batch_pl)
    >>> train_op = tf.train.RMSPropOptimizer(learning_rate, decay_rate).minimize(loss)

    """
    try:  # TF 1.0+ keyword signature
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=actions, logits=logits, name=name)
    except TypeError:
        # Only fall back when the keyword signature itself is rejected
        # (unexpected keyword argument). Catching every Exception would hide
        # genuine errors such as shape mismatches behind a confusing
        # secondary failure from the legacy call.
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, targets=actions)

    # ``tf.multiply`` on TF 1.0+, ``tf.mul`` on TF 0.12. Select by attribute
    # presence so errors raised by the multiplication itself are not swallowed.
    multiply = getattr(tf, 'multiply', None)
    if multiply is None:
        multiply = tf.mul
    loss = tf.reduce_sum(multiply(cross_entropy, rewards))  # element-wise product, then sum
    return loss


def log_weight(probs, weights, name='log_weight'):
    """Log weight: the mean of ``log(probs) * weights``.

    Parameters
    -----------
    probs : tensor
        If it is a network output, usually we should scale it to [0, 1] via softmax.
    weights : tensor
        The weights.
    name : str
        Name of the variable scope the ops are created in.

    Returns
    --------
    Tensor
        The tensor after applying the log weighted expression.

    """
    with tf.variable_scope(name):
        exp_v = tf.reduce_mean(tf.log(probs) * weights)
        return exp_v


def choice_action_by_probs(probs=(0.5, 0.5), action_list=None):
    """Choose and return an action according to the given action probability distribution.

    Parameters
    ------------
    probs : list of float.
        The probability distribution of all actions.
    action_list : None or a list of int or others
        A list of actions (integers, strings or others). If None, the actions
        are the integers from 0 to len(probs)-1.

    Returns
    --------
    float int or str
        The chosen action.

    Raises
    ------
    ValueError
        If ``action_list`` is given and its length differs from ``len(probs)``.

    Examples
    ----------
    >>> for _ in range(5):
    >>>     a = choice_action_by_probs([0.2, 0.4, 0.4])
    >>>     print(a)
    >>> for _ in range(3):
    >>>     a = choice_action_by_probs([0.5, 0.5], ['a', 'b'])
    >>>     print(a)

    """
    if action_list is None:
        # Default action set: one integer id per probability entry.
        action_list = np.arange(len(probs))
    elif len(action_list) != len(probs):
        # Validate only caller-supplied actions; the default list above
        # matches ``probs`` by construction.
        raise ValueError("number of actions should equal to number of probabilities.")
    return np.random.choice(action_list, p=probs)
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

FilesExpand file tree

rein.py

Latest commit

History

rein.py

File metadata and controls