forked from tensorlayer/TensorLayer
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrein.py
More file actions
167 lines (140 loc) · 5.17 KB
/
Copy pathrein.py
File metadata and controls
167 lines (140 loc) · 5.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#! /usr/bin/python
# -*- coding: utf-8 -*-
import numpy as np
import tensorflow as tf
from six.moves import xrange
__all__ = [
'discount_episode_rewards',
'cross_entropy_reward_loss',
'log_weight',
'choice_action_by_probs',
]
def discount_episode_rewards(rewards=None, gamma=0.99, mode=0):
"""Take 1D float array of rewards and compute discounted rewards for an
episode. When encount a non-zero value, consider as the end a of an episode.
Parameters
----------
rewards : list
List of rewards
gamma : float
Discounted factor
mode : int
Mode for computing the discount rewards.
- If mode == 0, reset the discount process when encount a non-zero reward (Ping-pong game).
- If mode == 1, would not reset the discount process.
Returns
--------
list of float
The discounted rewards.
Examples
----------
>>> rewards = np.asarray([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1])
>>> gamma = 0.9
>>> discount_rewards = tl.rein.discount_episode_rewards(rewards, gamma)
>>> print(discount_rewards)
... [ 0.72899997 0.81 0.89999998 1. 0.72899997 0.81
... 0.89999998 1. 0.72899997 0.81 0.89999998 1. ]
>>> discount_rewards = tl.rein.discount_episode_rewards(rewards, gamma, mode=1)
>>> print(discount_rewards)
... [ 1.52110755 1.69011939 1.87791049 2.08656716 1.20729685 1.34144104
... 1.49048996 1.65610003 0.72899997 0.81 0.89999998 1. ]
"""
if rewards is None:
raise Exception("rewards should be a list")
discounted_r = np.zeros_like(rewards, dtype=np.float32)
running_add = 0
for t in reversed(xrange(0, rewards.size)):
if mode == 0:
if rewards[t] != 0: running_add = 0
running_add = running_add * gamma + rewards[t]
discounted_r[t] = running_add
return discounted_r
def cross_entropy_reward_loss(logits, actions, rewards, name=None):
"""Calculate the loss for Policy Gradient Network.
Parameters
----------
logits : tensor
The network outputs without softmax. This function implements softmax inside.
actions : tensor or placeholder
The agent actions.
rewards : tensor or placeholder
The rewards.
Returns
--------
Tensor
The TensorFlow loss function.
Examples
----------
>>> states_batch_pl = tf.placeholder(tf.float32, shape=[None, D])
>>> network = InputLayer(states_batch_pl, name='input')
>>> network = DenseLayer(network, n_units=H, act=tf.nn.relu, name='relu1')
>>> network = DenseLayer(network, n_units=3, name='out')
>>> probs = network.outputs
>>> sampling_prob = tf.nn.softmax(probs)
>>> actions_batch_pl = tf.placeholder(tf.int32, shape=[None])
>>> discount_rewards_batch_pl = tf.placeholder(tf.float32, shape=[None])
>>> loss = tl.rein.cross_entropy_reward_loss(probs, actions_batch_pl, discount_rewards_batch_pl)
>>> train_op = tf.train.RMSPropOptimizer(learning_rate, decay_rate).minimize(loss)
"""
try: # TF 1.0+
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=actions, logits=logits, name=name)
except Exception:
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, targets=actions)
# cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, actions)
try: ## TF1.0+
loss = tf.reduce_sum(tf.multiply(cross_entropy, rewards))
except Exception: ## TF0.12
loss = tf.reduce_sum(tf.mul(cross_entropy, rewards)) # element-wise mul
return loss
def log_weight(probs, weights, name='log_weight'):
"""Log weight.
Parameters
-----------
probs : tensor
If it is a network output, usually we should scale it to [0, 1] via softmax.
weights : tensor
The weights.
Returns
--------
Tensor
The Tensor after appling the log weighted expression.
"""
with tf.variable_scope(name):
exp_v = tf.reduce_mean(tf.log(probs) * weights)
return exp_v
def choice_action_by_probs(probs=(0.5, 0.5), action_list=None):
"""Choice and return an an action by given the action probability distribution.
Parameters
------------
probs : list of float.
The probability distribution of all actions.
action_list : None or a list of int or others
A list of action in integer, string or others. If None, returns an integer range between 0 and len(probs)-1.
Returns
--------
float int or str
The chosen action.
Examples
----------
>>> for _ in range(5):
>>> a = choice_action_by_probs([0.2, 0.4, 0.4])
>>> print(a)
... 0
... 1
... 1
... 2
... 1
>>> for _ in range(3):
>>> a = choice_action_by_probs([0.5, 0.5], ['a', 'b'])
>>> print(a)
... a
... b
... b
"""
if action_list is None:
n_action = len(probs)
action_list = np.arange(n_action)
else:
if len(action_list) != len(probs):
raise Exception("number of actions should equal to number of probabilities.")
return np.random.choice(action_list, p=probs)