# Supplementary Materials

## Example #1

```python=
!pip3 install gymnasium[classic_control]
```

```python=
import gymnasium as gym
import math
import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

env = gym.make("CartPole-v1")

# Set up matplotlib for inline/interactive plotting
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
```

```python=
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):
    """A bounded buffer of past transitions sampled uniformly for training."""

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)
```

```python=
class DQN(nn.Module):
    """Simple MLP mapping an observation to one Q-value per action."""

    def __init__(self, n_observations, n_actions):
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(n_observations, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_actions)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)
```

```python=
BATCH_SIZE = 128
GAMMA = 0.99
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 1000
TAU = 0.005
LR = 1e-4

n_actions = env.action_space.n
state, info = env.reset()
n_observations = len(state)

policy_net = DQN(n_observations, n_actions).to(device)
target_net = DQN(n_observations, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())

optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)
memory = ReplayMemory(10000)
```

```python=
steps_done = 0


def select_action(state):
    """Epsilon-greedy action selection with an exponentially decaying epsilon."""
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # Exploit: pick the action with the largest predicted Q-value
            return policy_net(state).max(1)[1].view(1, 1)
    else:
        # Explore: sample a random action
        return torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long)
```

```python=
episode_durations = []


def plot_durations(show_result=False):
    plt.figure(1)
    durations_t = torch.tensor(episode_durations, dtype=torch.float)
    if show_result:
        plt.title('Result')
    else:
        plt.clf()
        plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy())
    # Also plot the 100-episode moving average once enough episodes exist
    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())

    plt.pause(0.001)  # pause a bit so that the plot is updated
    if is_ipython:
        if not show_result:
            display.display(plt.gcf())
            display.clear_output(wait=True)
        else:
            display.display(plt.gcf())
```

```python=
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch: list of Transitions -> Transition of batched tensors
    batch = Transition(*zip(*transitions))

    # Mask of non-terminal next states (None marks a terminal transition)
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                            batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                       if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s_t, a) for the actions that were actually taken
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # max_a Q_target(s_{t+1}, a), zero for terminal states
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    with torch.no_grad():
        next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0]
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Huber loss between current and bootstrapped Q-values
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()
```

```python=
if torch.cuda.is_available():
    num_episodes = 600
else:
    num_episodes = 50

for i_episode in range(num_episodes):
    state, info = env.reset()
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
    for t in count():
        action = select_action(state)
        observation, reward, terminated, truncated, _ = env.step(action.item())
        reward = torch.tensor([reward], device=device)
        done = terminated or truncated

        if terminated:
            next_state = None
        else:
            next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)

        # Store the transition and move to the next state
        memory.push(state, action, next_state, reward)
        state = next_state

        # One optimization step on the policy network
        optimize_model()

        # Soft update of the target network's weights
        target_net_state_dict = target_net.state_dict()
        policy_net_state_dict = policy_net.state_dict()
        for key in policy_net_state_dict:
            target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU)
        target_net.load_state_dict(target_net_state_dict)

        if done:
            episode_durations.append(t + 1)
            plot_durations()
            break

print('Complete')
plot_durations(show_result=True)
plt.ioff()
plt.show()
```
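The cells above only train and plot; as an optional minimal sketch (not part of the original cells), the trained `policy_net` can be rolled out greedily for a few episodes to check how long it balances the pole. It assumes the variables defined above (`policy_net`, `device`, `count`) are still in scope; the episode count is an arbitrary choice.

```python=
# Evaluate the trained policy greedily (no exploration) on a fresh env.
eval_env = gym.make("CartPole-v1")
eval_episodes = 10
eval_durations = []

for _ in range(eval_episodes):
    state, info = eval_env.reset()
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
    for t in count():
        with torch.no_grad():
            action = policy_net(state).max(1)[1].view(1, 1)
        observation, reward, terminated, truncated, _ = eval_env.step(action.item())
        if terminated or truncated:
            eval_durations.append(t + 1)
            break
        state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)

print(f"Average duration over {eval_episodes} greedy episodes: "
      f"{sum(eval_durations) / len(eval_durations):.1f}")
```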
## Example #2

```python=
!pip install transformers
```

```python=
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
res = classifier("I love the research.")
print(res)
```

```python=
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the explicitly loaded model and tokenizer in the pipeline
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
res = classifier("I love the research.")
print(res)
```

```python=
sequence = "I'm so hungry"

res = tokenizer(sequence)
print(res)

tokens = tokenizer.tokenize(sequence)
print(tokens)

ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

decoded_string = tokenizer.decode(ids)
print(decoded_string)
```

```python=
from transformers import pipeline

generator = pipeline("text-generation", model="distilgpt2")
res = generator("I love the research, because", max_length=30, num_return_sequences=2)
print(res)
```

```python=
from transformers import pipeline

classifier = pipeline("zero-shot-classification")
res = classifier("This is a good starting point for learning RL.",
                 candidate_labels=["education", "politics", "business"])
print(res)
```

```python=
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

X_train = ["I like pizza.", "I like a burger.", "I don't like a chicken."]

res = classifier(X_train)
print(res)

# Tokenize the batch manually and run the model directly
batch = tokenizer(X_train, padding=True, truncation=True, max_length=512, return_tensors="pt")
print(batch)

with torch.no_grad():
    outputs = model(**batch)
    print(outputs)
    predictions = F.softmax(outputs.logits, dim=1)
    print(predictions)
    labels = torch.argmax(predictions, dim=1)
    print(labels)
```
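The last cell prints `labels` as raw class indices; a small optional addition (not in the original cells) maps them back to the human-readable names stored in the model config, assuming `X_train`, `labels`, and `model` from the cell above are still defined.

```python=
# Map the argmax indices back to the label names stored in the model config.
for text, label_id in zip(X_train, labels):
    print(text, "->", model.config.id2label[label_id.item()])
```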
## Example #3

```python=
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
!pip3 install pyvirtualdisplay
!pip install stable-baselines3[extra]
!pip install gymnasium
!pip install huggingface_sb3
!pip install huggingface_hub
!pip install panda_gym
```

```python=
# Virtual display
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()
```

```python=
import os

import gymnasium as gym
import panda_gym

from huggingface_sb3 import load_from_hub, package_to_hub

from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.env_util import make_vec_env

from huggingface_hub import notebook_login
```

```python=
env_id = "PandaReachDense-v3"

# Create the env
env = gym.make(env_id)

# Get the state space and action space
s_size = env.observation_space.shape
a_size = env.action_space
```

```python=
print("_____OBSERVATION SPACE_____ \n")
print("The State Space is: ", s_size)
print("Sample observation", env.observation_space.sample())  # Get a random observation

print("\n _____ACTION SPACE_____ \n")
print("The Action Space is: ", a_size)
print("Action Space Sample", env.action_space.sample())  # Take a random action
```

```python=
env = make_vec_env(env_id, n_envs=4)
env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
```

```python=
model = A2C(policy="MultiInputPolicy", env=env, verbose=1)
```

```python=
model.learn(1_000_000)
# model.learn(100)

# Save the model and VecNormalize statistics when saving the agent
model.save("a2c-PandaReachDense-v3")
env.save("vec_normalize.pkl")
```

```python=
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

# Load the saved statistics
eval_env = DummyVecEnv([lambda: gym.make("PandaReachDense-v3")])
eval_env = VecNormalize.load("vec_normalize.pkl", eval_env)

# We need to override the render_mode
eval_env.render_mode = "rgb_array"

# Do not update the normalization statistics at test time
eval_env.training = False
# Reward normalization is not needed at test time
eval_env.norm_reward = False

# Load the agent
model = A2C.load("a2c-PandaReachDense-v3")

mean_reward, std_reward = evaluate_policy(model, eval_env)
print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")
```

```python=
notebook_login()
!git config --global credential.helper store
```

```python=
from huggingface_sb3 import package_to_hub

package_to_hub(
    model=model,
    model_name=f"a2c-{env_id}",
    model_architecture="A2C",
    env_id=env_id,
    eval_env=eval_env,
    repo_id=f"CJJ1234/a2c-{env_id}",  # Change the username
    commit_message="Initial commit",
)
```
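`load_from_hub` is imported in Example #3 but never used; the optional sketch below shows how the pushed checkpoint could be pulled back down and re-evaluated. The repo id and the `.zip` filename are assumptions that mirror the naming used in the `package_to_hub` call above and would need to match the actual upload.

```python=
from huggingface_sb3 import load_from_hub

# Download the checkpoint pushed by package_to_hub (repo id and filename are
# placeholders following the naming convention used above).
checkpoint = load_from_hub(
    repo_id=f"CJJ1234/a2c-{env_id}",
    filename=f"a2c-{env_id}.zip",
)
model = A2C.load(checkpoint)

mean_reward, std_reward = evaluate_policy(model, eval_env)
print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")
```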