{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## ä¸ãRNNä»é¶å¼å§å®ç°"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import math\n",
"import torch\n",
"from torch import nn\n",
"from torch.nn import functional as F\n",
"from d2l import torch as d2l\n",
"\n",
"batch_size, num_steps = 32, 35\n",
"train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### ç¬çç¼ç "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0],\n",
" [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0]])"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"F.one_hot(torch.tensor([0, 2]), len(vocab)) #å°[0, 2]å±å¼ä¸ºé¿åº¦ä¸ºlen(vocab)大å°çç¬çåé"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### å°æ¹éæ°æ®å½¢ç¶æ¯ (æ¹é大å°, æ¶é´æ¥æ°)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([5, 2, 28])"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X = torch.arange(10).reshape((2, 5)) # (batch_size, n_step)\n",
"F.one_hot(X.T, 28).shape # (n_step, batch_size, n_features)"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### åå§å循ç¯ç¥ç»ç½ç»æ¨¡åçæ¨¡ååæ°"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def get_params(vocab_size, num_hiddens, device):\n",
" num_inputs = num_outputs = vocab_size\n",
" \n",
" def normal(shape):\n",
" return torch.randn(size=shape, device=device) * 0.01\n",
" \n",
" #éèå±åæ°\n",
" W_xh = normal((num_inputs, num_hiddens))\n",
" W_hh = normal((num_hiddens, num_hiddens))\n",
" b_h = torch.zeros(num_hiddens, device=device)\n",
" #è¾åºåæ°\n",
" W_hq = normal((num_hiddens, num_outputs))\n",
" b_q = torch.zeros(num_outputs, device=device)\n",
" #éå æ¢¯åº¦\n",
" params = [W_xh, W_hh, b_h, W_hq, b_q]\n",
" for param in params:\n",
" param.requires_grad_(True)\n",
" return params"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### init_rnn_state彿°ï¼å¨åå§åæ¶è¿åéèç¶æ"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def init_rnn_state(batch_size, num_hiddens, device): #è¿ååå§éå±ç¶æ\n",
" return (torch.zeros((batch_size, num_hiddens), device=device), )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### rnn彿°ï¼å®ä¹**ä¸ä¸ªæ¶é´æ¥å
**计ç®éèç¶æåè¾åº\n",
"æ´æ°éèç¶æ: $$h_t = \\phi(W_{hh}h_{t-1}+W_{hx}x_{t-1}+b_{h})$$\n",
"è¾åº: $$o_{t}=\\phi(W_{ho}h_{t}+b_{o})$$"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def rnn(inputs, state, params):\n",
" W_xh, W_hh, b_h, W_hq, b_q = params\n",
" H, = state\n",
" outputs = [] #n_step个大å°ä¸º(batch_size, n_outputs)çtorchå¼ éå表\n",
" # inputs: (n_step, batch_size, n_features)\n",
" for X in inputs: # ææ¶åºéå\n",
" H = torch.tanh(torch.mm(X, W_xh) + torch.mm(H, W_hh) + b_h) # (batch_size, n_hiddens)\n",
" Y = torch.mm(H, W_hq) + b_q # (batch_size, n_outputs)\n",
" outputs.append(Y) \n",
" return torch.cat(outputs, dim=0), (H,) #catåç»´æ°(n_step * batch_size, n_outputs)"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### å建ä¸ä¸ªç±»æ¥å
è£
è¿äºå½æ°"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"class RNNModelScratch:\n",
" \"\"\"ä»é¶å¼å§å®ç°ç循ç¯ç¥ç»ç½ç»æ¨¡å\"\"\"\n",
" def __init__(self, vocab_size, num_hiddens, device, get_params,\n",
" init_state, forward_fn):\n",
" self.vocab_size, self.num_hiddens = vocab_size, num_hiddens\n",
" self.params = get_params(vocab_size, num_hiddens, device) #è·å¾æ¨¡ååå§åæ°\n",
" self.init_state, self.forward_fn = init_state, forward_fn #éå±åå§å½æ°, åé¦å½æ° \n",
" #注æåé¦å½æ°å¯ä»¥æ¢ægru, lstmç\n",
" \n",
" def __call__(self, X, state):\n",
" #è¾å
¥X: (batch_size, n_step)\n",
" #转置ï¼onehotå X:(n_step, batch_size, n_features)\n",
" X = F.one_hot(X.T, self.vocab_size).type(torch.float32)\n",
" return self.forward_fn(X, state, self.params)\n",
" \n",
" def begin_state(self, batch_size, device):\n",
" return self.init_state(batch_size, self.num_hiddens, device)"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### æ£æ¥è¾åºæ¯å¦å
·ææ£ç¡®çå½¢ç¶"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(torch.Size([10, 28]), 1, torch.Size([2, 512]))"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"num_hiddens = 512\n",
"net = RNNModelScratch(len(vocab), num_hiddens, d2l.try_gpu(), \n",
" get_params, init_rnn_state, rnn)\n",
"# X: (2, 5) 对åº(batch_size, n_step)\n",
"state = net.begin_state(X.shape[0], d2l.try_gpu())\n",
"Y, new_state = net(X.to(d2l.try_gpu()), state)\n",
"#Y: (batch_size * n_step, n_outputs) \n",
"#new_stateä¸ä¸ä¸ªtorchå¼ é(æåä¸ä¸ªæ¶é´æ¥çéå±)\n",
"#new_state[0].shape: (batch_size, n_hiddens)\n",
"Y.shape, len(new_state), new_state[0].shape"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### é¦å
å®ä¹é¢æµå½æ°æ¥çæprefixä¹åçæ°å符"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'time travellerrrrrrrrrrr'"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def predict_ch8(prefix, num_preds, net, vocab, device):\n",
" \"\"\"å¨`prefix`åé¢çææ°å符\"\"\"\n",
" #çæåå§éèç¶æ\n",
" state = net.begin_state(batch_size=1, device=device) \n",
" outputs = [vocab[prefix[0]]] #第ä¸ä¸ªwordçæ´å䏿 \n",
" #å°æè¿é¢æµçè¯åætensor, batch_size=1, n_step=1\n",
" get_input = lambda: torch.tensor([outputs[-1]], device=device).reshape((1, 1))\n",
" for y in prefix[1:]: # é¢çæä½, ä¿åçå¼\n",
" _, state = net(get_input(), state)\n",
" outputs.append(vocab[y])\n",
" for _ in range(num_preds): # 颿µnum_predsæ¥\n",
" y, state = net(get_input(), state)\n",
" outputs.append(int(y.argmax(dim=1).reshape(1)))\n",
" return ''.join([vocab.idx_to_token[i] for i in outputs])\n",
"\n",
"predict_ch8('time traveller', 10, net, vocab, d2l.try_gpu())"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### 梯度è£åª\n",
"$$\\mathbf{g}\\leftarrow min(1, \\frac{\\theta}{\\parallel \\mathbf{g} \\parallel}) \\mathbf{g}$$"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def grad_clipping(net, theta):\n",
" \"\"\"è£åªæ¢¯åº¦\"\"\"\n",
" if isinstance(net, nn.Module):#å¦æä½¿ç¨nn.Moduleæ¥å®ç°\n",
" params = [p for p in net.parameters() if p.requires_grad]\n",
" else:\n",
" params = net.params\n",
" norm = torch.sqrt(sum(torch.sum(\n",
" (p.grad**2)) for p in params))\n",
" if norm > theta:\n",
" for param in params:\n",
" param.grad[:] *= theta / norm"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### æ¥çtrain_iteræ°æ®é"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([32, 35]) torch.Size([32, 35])\n",
"tensor([[ 1, 3, 5, ..., 2, 1, 15],\n",
" [ 4, 6, 11, ..., 5, 10, 8],\n",
" [ 3, 1, 4, ..., 2, 8, 8],\n",
" ...,\n",
" [15, 7, 6, ..., 21, 14, 3],\n",
" [10, 19, 8, ..., 14, 8, 3],\n",
" [ 1, 13, 2, ..., 10, 1, 4]])\n",
"tensor([[ 3, 5, 13, ..., 1, 15, 7],\n",
" [ 6, 11, 20, ..., 10, 8, 1],\n",
" [ 1, 4, 6, ..., 8, 8, 1],\n",
" ...,\n",
" [ 7, 6, 26, ..., 14, 3, 21],\n",
" [19, 8, 3, ..., 8, 3, 1],\n",
" [13, 2, 15, ..., 1, 4, 6]])\n",
" time traveller for so it will be c\n",
"time traveller for so it will be co\n",
"andpassed in our glasses our chairs\n",
"ndpassed in our glasses our chairs \n",
"\n",
"onvenient to speak of himwas expoun\n",
"nvenient to speak of himwas expound\n",
"8\n"
]
}
],
"source": [
"count = 0\n",
"for X, Y in train_iter:\n",
" if count == 0:#第0个batch\n",
" print(X.shape, Y.shape)\n",
" print(X) # (batch_size(=32), n_step(=35))\n",
" print(Y) # (batch_size, n_step)\n",
" print(''.join([vocab.idx_to_token[i] for i in X[0]])) #æå°ç¬¬0ä¸ªæ ·æ¬å¯¹åºå¥å\n",
" print(''.join([vocab.idx_to_token[i] for i in Y[0]])) #æå°ç¬¬0ä¸ªæ ·æ¬çå¼\n",
" print(''.join([vocab.idx_to_token[i] for i in X[1]])) #æå°ç¬¬1ä¸ªæ ·æ¬å¯¹åºå¥å\n",
" print(''.join([vocab.idx_to_token[i] for i in Y[1]])) #æå°ç¬¬1ä¸ªæ ·æ¬çå¼\n",
" print()\n",
" if count == 1:#第1个batch, å
容å第0个batchä¸ä¸æ¿æ¥(ææ¶åºå
³ç³»)\n",
" print(''.join([vocab.idx_to_token[i] for i in X[0]]))\n",
" print(''.join([vocab.idx_to_token[i] for i in Y[0]]))\n",
" count += 1\n",
"print(count) #æå°batchæ°é=8"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### å®ä¹ä¸ä¸ªå½æ°å¨ä¸ä¸ªè¿ä»£å¨æå
è®ç»æ¨¡å"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def train_epoch_ch8(net, train_iter, loss, updater, device,\n",
" use_random_iter):\n",
" \"\"\"è®ç»æ¨¡åä¸ä¸ªè¿ä»£å¨æï¼å®ä¹è§ç¬¬8ç« ï¼\"\"\"\n",
" state, timer = None, d2l.Timer()\n",
" metric = d2l.Accumulator(2)\n",
" for X, Y in train_iter:\n",
" if state is None or use_random_iter:#为第ä¸ä¸ªbatch æè
batchä¹é´æ¶åºä¸ä¸è¿ç»\n",
" state = net.begin_state(batch_size=X.shape[0], device=device) #åå§åstate\n",
" else:\n",
" if isinstance(net, nn.Module) and not isinstance(state, tuple):\n",
" # state对äºnn.GRUæ¯ä¸ªå¼ é\n",
" state.detach_() # 对ä¹åçé¨ååæ¶æ¢¯åº¦ååä¼ æè®¡ç®\n",
" else:\n",
" # state对äºnn.LSTMæè
å¯¹äºæä»¬ä»é¶å¼å§å®ç°çæ¨¡åæ¯ä¸ªå
ç»(å¼ éææ)\n",
" for s in state:\n",
" s.detach_()\n",
" y = Y.T.reshape(-1) #reshapeçå¼, å°n_stepæ¾å¨ç¬¬ä¸ç»´ä¹åææä¸ç»´åé\n",
" X, y = X.to(device), y.to(device)\n",
" y_hat,state = net(X, state)\n",
" l = loss(y_hat, y.long()).mean()\n",
" if isinstance(updater, torch.optim.Optimizer):#è°ç¨torchä¼å彿°å®ç°\n",
" updater.zero_grad()\n",
" l.backward()\n",
" grad_clipping(net, 1)\n",
" updater.step()\n",
" else:\n",
" l.backward()\n",
" grad_clipping(net, 1)\n",
" updater(batch_size=1)\n",
" metric.add(l * y.numel(), y.numel())\n",
" return math.exp(metric[0] / metric[1]), metric[1] / timer.stop()"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### 循ç¯ç¥ç»ç½ç»æ¨¡åçè®ç»å½æ°æ¢æ¯æä»é¶å¼å§å®ç°ï¼ä¹å¯ä»¥ä½¿ç¨é«çº§APIå®ç°"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"def train_ch8(net, train_iter, vocab, lr, num_epochs, device, use_random_iter=False):\n",
" \"\"\"è®ç»æ¨¡åï¼å®ä¹è§ç¬¬8ç« ï¼\"\"\"\n",
" loss = nn.CrossEntropyLoss()\n",
" animator = d2l.Animator(xlabel='epoch', ylabel='perplexity',\n",
" legend=['train'], xlim=[10,num_epochs])\n",
" #åå§åä¼åå¨\n",
" if isinstance(net, nn.Module):\n",
" updater = torch.optim.SGD(net.parameters(), lr)\n",
" else:\n",
" updater = lambda batch_size: d2l.sgd(net.params, lr, batch_size)\n",
" predict = lambda prefix: predict_ch8(prefix, 50, net, vocab, device)\n",
" #è®ç»å颿µ\n",
" for epoch in range(num_epochs):\n",
" ppl, speed = train_epoch_ch8(\n",
" net, train_iter, loss, updater, device, use_random_iter)\n",
" if (epoch + 1) % 10 == 0:\n",
" print(predict('time traveller'))\n",
" animator.add(epoch+1, [ppl])\n",
" print(f'å°æåº¦ {ppl:.1f}, {speed:.1f} è¯å
/ç§ {str(device)}')\n",
" print(predict('time traveller'))\n",
" print(predict('traveller'))"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### è®ç»å¾ªç¯ç¥ç»ç½ç»æ¨¡å(æåºè¿ä»£batch)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"å°æåº¦ 1.0, 46320.2 è¯å
/ç§ cuda:0\n",
"time travelleryou can show black is white by argument said filby\n",
"travelleryou can show black is white by argument said filby\n"
]
},
{
"data": {
"image/svg+xml": [
"\r\n",
"\r\n",
"\r\n",
"\r\n"
],
"text/plain": [
"