# NLP in PyTorch for Monkey: 2. Feed-forward Networks for NLP

This tutorial is a cheat sheet for the book "Natural Language Processing with PyTorch: Build Intelligent Language Applications Using Deep Learning".
https://github.com/delip/PyTorchNLPBook

Outline
---
1. [PyTorch Basics](https://hackmd.io/@martinliu/Hkt4VBggi)
2. Feed-forward Networks for NLP
3. Embedding Words and Types
4. Sequence Modeling for NLP
5. Intermediate Sequence Modeling for NLP
6. Advanced Sequence Modeling for NLP
7. My Note

---

### Perceptron in binary classification task
---
Implementing a perceptron using PyTorch

```
import numpy as np
import torch
import torch.nn as nn

seed = 1337
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)

%matplotlib inline
```

Sigmoid activation

```
import torch
import matplotlib.pyplot as plt

x = torch.arange(-5., 5., 0.1)
y = torch.sigmoid(x)
plt.plot(x.numpy(), y.detach().numpy())
plt.show()
```

Tanh activation

```
import torch
import matplotlib.pyplot as plt

x = torch.arange(-5., 5., 0.1)
y = torch.tanh(x)
plt.plot(x.numpy(), y.detach().numpy())
plt.show()
```

ReLU activation

```
import torch
import matplotlib.pyplot as plt

relu = torch.nn.ReLU()
x = torch.arange(-5., 5., 0.1)
y = relu(x)
plt.plot(x.numpy(), y.detach().numpy())
plt.show()
```

PReLU activation

```
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

prelu = nn.PReLU(num_parameters=1)
x = torch.arange(-5., 5., 0.1)
y = prelu(x)
plt.plot(x.numpy(), y.detach().numpy())
plt.show()
```

Softmax activation

```
softmax = nn.Softmax(dim=1)
x_input = torch.randn(1, 3)
y_output = softmax(x_input)
print(x_input)
# tensor([[-2.0260, -2.0655, -1.2054]])
print(y_output)
# tensor([[0.2362, 0.2271, 0.5367]])
print(torch.sum(y_output, dim=1))
# tensor([1.])
```

MSE loss

```
import torch
import torch.nn as nn

mse_loss = nn.MSELoss()
outputs = torch.randn(3, 5, requires_grad=True)
targets = torch.randn(3, 5)
loss = mse_loss(outputs, targets)
loss.backward()
print(loss)
# tensor(1.6031, grad_fn=<MseLossBackward>)
```

Cross-entropy loss

```
import torch
import torch.nn as nn

ce_loss = nn.CrossEntropyLoss()
outputs = torch.randn(3, 5, requires_grad=True)
targets = torch.tensor([1, 0, 3], dtype=torch.int64)
loss = ce_loss(outputs, targets)
loss.backward()
print(loss)
# tensor(2.5949, grad_fn=<NllLossBackward>)
```

Binary cross-entropy loss

```
bce_loss = nn.BCELoss()
sigmoid = nn.Sigmoid()
probabilities = sigmoid(torch.randn(4, 1, requires_grad=True))
print(probabilities)
# tensor([[0.7411],
#         [0.5622],
#         [0.8286],
#         [0.6569]], grad_fn=<SigmoidBackward>)

targets = torch.tensor([1, 0, 1, 0], dtype=torch.float32).view(4, 1)
loss = bce_loss(probabilities, targets)
loss.backward()
print(loss)
# tensor(0.5958, grad_fn=<BinaryCrossEntropyBackward>)
```
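A side note that is not in the original cheat sheet: `nn.CrossEntropyLoss` applies log-softmax internally, so the model should emit raw scores (logits) rather than softmax probabilities. A minimal sketch checking the equivalence with `nn.NLLLoss` on log-softmax outputs:

```
import torch
import torch.nn as nn
import torch.nn.functional as F

outputs = torch.randn(3, 5)
targets = torch.tensor([1, 0, 3], dtype=torch.int64)

# CrossEntropyLoss == log-softmax followed by NLLLoss
ce = nn.CrossEntropyLoss()(outputs, targets)
nll = nn.NLLLoss()(F.log_softmax(outputs, dim=1), targets)
print(torch.allclose(ce, nll))   # True
```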
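Another note not in the original: applying `nn.Sigmoid` followed by `nn.BCELoss`, as above, can be numerically unstable. PyTorch provides `nn.BCEWithLogitsLoss`, which takes the raw scores and folds the sigmoid into the loss. A minimal sketch under the same random-tensor setup (values differ from run to run):

```
import torch
import torch.nn as nn

# BCEWithLogitsLoss expects raw scores (logits), not probabilities.
bce_logits_loss = nn.BCEWithLogitsLoss()
logits = torch.randn(4, 1, requires_grad=True)
targets = torch.tensor([1, 0, 1, 0], dtype=torch.float32).view(4, 1)

loss = bce_logits_loss(logits, targets)
loss.backward()
print(loss)   # matches sigmoid + BCELoss on the same logits, computed more stably
```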
---

### Diving Deep into Supervised Training

#### Instantiating the Adam optimizer

Global Settings

```
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

%matplotlib inline

LEFT_CENTER = (3, 3)
RIGHT_CENTER = (3, -2)
```

Defining the Model

```
class Perceptron(nn.Module):
    """ A Perceptron is one Linear layer """

    def __init__(self, input_dim):
        """
        Args:
            input_dim (int): size of the input features
        """
        super(Perceptron, self).__init__()
        self.fc1 = nn.Linear(input_dim, 1)

    def forward(self, x_in):
        """The forward pass of the perceptron

        Args:
            x_in (torch.Tensor): an input data tensor.
                x_in.shape should be (batch, input_dim)
        Returns:
            the resulting tensor. tensor.shape should be (batch, 1)
        """
        return torch.sigmoid(self.fc1(x_in))
```

Get Data Function

```
def get_toy_data(batch_size, left_center=LEFT_CENTER, right_center=RIGHT_CENTER):
    x_data = []
    y_targets = np.zeros(batch_size)
    for batch_i in range(batch_size):
        if np.random.random() > 0.5:
            x_data.append(np.random.normal(loc=left_center))
        else:
            x_data.append(np.random.normal(loc=right_center))
            y_targets[batch_i] = 1
    return torch.tensor(x_data, dtype=torch.float32), torch.tensor(y_targets, dtype=torch.float32)
```

Visualizing Results Function

```
def visualize_results(perceptron, x_data, y_truth, n_samples=1000,
                      ax=None, epoch=None, title='',
                      levels=[0.3, 0.4, 0.5], linestyles=['--', '-', '--']):
    y_pred = perceptron(x_data)
    y_pred = (y_pred > 0.5).long().data.numpy().astype(np.int32)

    x_data = x_data.data.numpy()
    y_truth = y_truth.data.numpy().astype(np.int32)

    n_classes = 2

    all_x = [[] for _ in range(n_classes)]
    all_colors = [[] for _ in range(n_classes)]

    colors = ['black', 'white']
    markers = ['o', '*']

    for x_i, y_pred_i, y_true_i in zip(x_data, y_pred, y_truth):
        all_x[y_true_i].append(x_i)
        if y_pred_i == y_true_i:
            all_colors[y_true_i].append("white")
        else:
            all_colors[y_true_i].append("black")
        #all_colors[y_true_i].append(colors[y_pred_i])

    all_x = [np.stack(x_list) for x_list in all_x]

    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=(10, 10))

    for x_list, color_list, marker in zip(all_x, all_colors, markers):
        ax.scatter(x_list[:, 0], x_list[:, 1], edgecolor="black", marker=marker,
                   facecolor=color_list, s=300)

    xlim = (min([x_list[:, 0].min() for x_list in all_x]),
            max([x_list[:, 0].max() for x_list in all_x]))
    ylim = (min([x_list[:, 1].min() for x_list in all_x]),
            max([x_list[:, 1].max() for x_list in all_x]))

    # hyperplane
    xx = np.linspace(xlim[0], xlim[1], 30)
    yy = np.linspace(ylim[0], ylim[1], 30)
    YY, XX = np.meshgrid(yy, xx)
    xy = np.vstack([XX.ravel(), YY.ravel()]).T
    Z = perceptron(torch.tensor(xy, dtype=torch.float32)).detach().numpy().reshape(XX.shape)
    ax.contour(XX, YY, Z, colors='k', levels=levels, linestyles=linestyles)

    plt.suptitle(title)

    if epoch is not None:
        plt.text(xlim[0], ylim[1], "Epoch = {}".format(str(epoch)))
```
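A note not in the book's notebook: the `levels=[0.5]` contour that `visualize_results` draws is exactly the set of points where the perceptron's pre-activation is zero, since sigmoid(w·x + b) = 0.5 iff w·x + b = 0. For 2-D inputs that boundary is a straight line whose slope and intercept can be read directly off the linear layer's parameters. A minimal sketch, assuming the `Perceptron` class above (the names `w`, `b`, `slope`, `intercept` are mine):

```
# Read the decision boundary straight from the linear layer's parameters.
# sigmoid(w1*x1 + w2*x2 + b) = 0.5  <=>  w1*x1 + w2*x2 + b = 0
p = Perceptron(input_dim=2)

w = p.fc1.weight.data.numpy().flatten()   # [w1, w2]
b = p.fc1.bias.data.item()                # scalar bias

# Rearranged as x2 = slope * x1 + intercept (assumes w2 != 0)
slope = -w[0] / w[1]
intercept = -b / w[1]
print("x2 = {:.3f} * x1 + {:.3f}".format(slope, intercept))
```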
#### A supervised training loop for a perceptron and binary classification

Initial Data Plot

```
seed = 1337

torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)

x_data, y_truth = get_toy_data(batch_size=1000)

x_data = x_data.data.numpy()
y_truth = y_truth.data.numpy()

left_x = []
right_x = []
left_colors = []
right_colors = []

for x_i, y_true_i in zip(x_data, y_truth):
    color = 'black'
    if y_true_i == 0:
        left_x.append(x_i)
        left_colors.append(color)
    else:
        right_x.append(x_i)
        right_colors.append(color)

left_x = np.stack(left_x)
right_x = np.stack(right_x)

_, ax = plt.subplots(1, 1, figsize=(10, 4))

ax.scatter(left_x[:, 0], left_x[:, 1], color=left_colors, marker='*', s=100)
ax.scatter(right_x[:, 0], right_x[:, 1], facecolor='white', edgecolor=right_colors, marker='o', s=100)

plt.axis('off');
```

The Training + intermittent data plots

```
lr = 0.01
input_dim = 2

batch_size = 1000
n_epochs = 12
n_batches = 5

seed = 1337
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)

perceptron = Perceptron(input_dim=input_dim)
optimizer = optim.Adam(params=perceptron.parameters(), lr=lr)
bce_loss = nn.BCELoss()

losses = []

x_data_static, y_truth_static = get_toy_data(batch_size)
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
visualize_results(perceptron, x_data_static, y_truth_static, ax=ax, title='Initial Model State')
plt.axis('off')
#plt.savefig('initial.png')

change = 1.0
last = 10.0
epsilon = 1e-3
epoch = 0
while change > epsilon or epoch < n_epochs or last > 0.3:
#for epoch in range(n_epochs):
    for _ in range(n_batches):
        optimizer.zero_grad()
        x_data, y_target = get_toy_data(batch_size)
        y_pred = perceptron(x_data).squeeze()
        loss = bce_loss(y_pred, y_target)
        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        losses.append(loss_value)

        change = abs(last - loss_value)
        last = loss_value

    fig, ax = plt.subplots(1, 1, figsize=(10, 5))
    visualize_results(perceptron, x_data_static, y_truth_static, ax=ax, epoch=epoch,
                      title=f"{loss_value}; {change}")
    plt.axis('off')
    epoch += 1
    #plt.savefig('epoch{}_toylearning.png'.format(epoch))
```

Final model and plot results

```
_, axes = plt.subplots(1, 2, figsize=(12, 4))

axes[0].scatter(left_x[:, 0], left_x[:, 1], facecolor='white', edgecolor='black', marker='o', s=300)
axes[0].scatter(right_x[:, 0], right_x[:, 1], facecolor='white', edgecolor='black', marker='*', s=300)
axes[0].axis('off');
visualize_results(perceptron, x_data_static, y_truth_static, epoch=None, levels=[0.5], ax=axes[1])
axes[1].axis('off');

plt.savefig('perceptron_final.png')
plt.savefig('perceptron_final.pdf')
```

---

### Classifying Sentiment of Restaurant Reviews

Dataset preprocessing

```
```

```
```

Example 3-12. Creating training, validation, and testing splits

```
```

Example 3-13. Minimally cleaning the data

```
```

Example 3-14. A PyTorch Dataset class for the Yelp Review dataset

```
```

Example 3-15. The Vocabulary class maintains token to integer mapping needed for the rest of the machine learning pipeline

```
```

Example 3-16. The Vectorizer class converts text to numeric vectors

```
```

Example 3-17. Generating minibatches from a dataset

```
```

Example 3-18. A perceptron classifier for classifying Yelp reviews

```
```

Example 3-19. Hyperparameters and program options for the perceptron-based Yelp review classifier

```
```

Example 3-20. Instantiating the dataset, model, loss, optimizer, and training state

```
```

Example 3-21. A bare-bones training loop

```
```

Example 3-22. Test set evaluation

```
```

Example 3-23. Printing the prediction for a sample review

```
```

Example 3-24. Inspecting what the classifier learned

```
```

###### tags: `Python` `pytorch` `NLP`