The material below is implemented with more details in this notebook: https://colab.research.google.com/drive/19IZiHeZTWK6pjGsrq0iJRJhXuaWNfzpM?usp=sharing
For $p, q, m \in \mathbb{N}^*$, let:
* $\mathcal{S} = [p]^q$: the set of sequences of length $q$ over the alphabet $[p] = \{0, 1, \cdots, p-1\}$. We have $|\mathcal{S}| = p^q$.
* Another way of looking at it is to consider $\mathcal{S}=\{0, 1, \cdots, p^q-1 \}$, the set of all integers written in base $p$ (for example $p=10$ for decimals) with at most $q$ digits (hence $p^q-1$ being the largest one), where each integer is left-padded with $0$'s to obtain a sequence of length exactly $q$ (for $q=3$ and $p=2$, $1$ becomes $001$, $101$ remains $101$, etc.). This view is more convenient for generating sequences: we only have to generate integers between $0$ and $p^q-1$ (which is easy to do), then pad them (see the sketch after this list).
* $\mathcal{F} = S_q$: the symmetric group $S_q$, i.e. the set of permutations of $q$ elements. We have $|\mathcal{F}| = q!$.
* We randomly partition $\mathcal{F}$ into two disjoint and non-empty sets $\mathcal{F}_{\text{train}}$ and $\mathcal{F}_{\text{val}}$, for the training and the validation dataset respectively. The training data fraction $r = |\mathcal{F}_{\text{train}}| / |\mathcal{F}|$ is a hyperparameter.
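For example, a minimal sketch of this integer-to-sequence view (the helper name `int_to_sequence` is mine, not something from the notebook):
```python
def int_to_sequence(n, p, q):
    """Base-p digits of n, left-padded with 0's to length q."""
    digits = []
    for _ in range(q):
        digits.append(n % p)
        n //= p
    return tuple(reversed(digits))

# With p=2, q=3: 1 -> (0, 0, 1), i.e. "001"; 5 -> (1, 0, 1), i.e. "101"
assert int_to_sequence(1, p=2, q=3) == (0, 0, 1)
assert int_to_sequence(5, p=2, q=3) == (1, 0, 1)
```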
Each training/validation example is generated as follows:
* (Randomly) pick a permutation $\sigma \in \mathcal{F}_{\text{train}/\text{val}}$
* (Randomly) generate, without repetition, a set $\{ (x_i, y_i = \sigma(x_i)) \}_{i=1}^m \subset \mathcal{S} \times \mathcal{S}$ of $m$ sequences together with their images under the permutation, plus one more pair $(x, y=\sigma(x)) \in \mathcal{S}^2$. This gives $s = {{|\mathcal{S}|}\choose{m+1}} = \frac{|\mathcal{S}|!}{\left( |\mathcal{S}| - m - 1 \right)! (m+1)!}$ possibilities if the order in which we choose the sequences does not matter, and $s = (m+1)! \times{{|\mathcal{S}|}\choose{m+1}} = \frac{|\mathcal{S}|!}{\left( |\mathcal{S}| - m - 1 \right)!}$ otherwise.
* The input to the model is the sequence "$\sigma : x_1 = y_1, \dots, x_m = y_m, x =$"
* The expected output is "$\sigma : x_1 = y_1, \dots, x_m = y_m, x = y$".
* The loss (as well as the accuracy) is computed only on the answer part $y$ of the whole sequence (see the formatting sketch just after this list).
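To make the input/output format concrete, here is an illustrative sketch; the exact tokenization (separators, how a sequence is written out) is not fixed by the description above, so `format_example` and its string layout are only an assumption:
```python
def format_example(pairs, x, y=None):
    """Render one example as the model input (y=None) or the full expected output."""
    tok = lambda s: "".join(map(str, s))  # e.g. (1, 0, 1) -> "101"
    prompt = "sigma : " + ", ".join(f"{tok(a)} = {tok(b)}" for a, b in pairs)
    prompt += f", {tok(x)} ="
    return prompt if y is None else prompt + f" {tok(y)}"

# sigma permutes positions, with the convention y[i] = x[sigma[i]] (as in the dataset code below)
sigma = (1, 2, 0)
apply_sigma = lambda s: tuple(s[sigma[i]] for i in range(len(s)))
pairs = [(s, apply_sigma(s)) for s in [(0, 0, 1), (1, 0, 1)]]
x = (1, 1, 0)
print(format_example(pairs, x))                    # model input:     sigma : 001 = 010, 101 = 011, 110 =
print(format_example(pairs, x, y=apply_sigma(x)))  # expected output: sigma : 001 = 010, 101 = 011, 110 = 101
```
With such a layout, the loss mask would cover only the final answer tokens (`101` in this example).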
Above, "randomly" means uniformly at random. The number of unique examples we can generate is $n_{\text{train}/\text{val}} = |\mathcal{F}_{\text{train}/\text{val}}| \times s$.
```python
from math import factorial

def dataset_size(p, q, m, order_matter=True):
    size_S = p**q
    assert m + 1 <= size_S
    # Ordered count: q! * |S|! / (|S| - m - 1)!
    n = factorial(q) * (factorial(size_S) // factorial(size_S - m - 1))
    # If the order of the chosen sequences does not matter, divide by (m+1)!
    return n if order_matter else n // factorial(m + 1)

p, q, m = 2, 3, 5
n = dataset_size(p, q, m, order_matter=False)  # 168 = 3! * C(8, 6)
print(n)
```
> 168
If $(p, q, m)$ are small, we could, for a given list of permutations $\mathcal{F}_{\text{train}/\text{val}}$, generate all the possible $(m+2)$-tuples $\left(\sigma, \{ (x_i, y_i = \sigma(x_i)) \}_{i=1}^m, (x, y=\sigma(x)) \right)$ in a list, use that list as the training/validation dataset, and hand it to the (PyTorch) dataloader. But I'm afraid this will blow up the memory if $(p, q, m)$ become large (see the sketch below). So it is better to define our own data iterator that generates each sample on the fly (online learning), or simply to do so in `__getitem__(self, idx)` of the dataset class, as in the code further down.
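For illustration, a sketch of the "materialize everything" variant, with a hypothetical `enumerate_all_examples` helper (only reasonable for tiny $(p, q, m)$, which is exactly the point):
```python
from itertools import permutations, product

def enumerate_all_examples(p, q, m, perms):
    """Build the full list of (sigma, [(x_i, y_i)], (x, y)) tuples (ordered choice of sequences)."""
    S = list(product(range(p), repeat=q))
    data = []
    for sigma in perms:
        apply_sigma = lambda s: tuple(s[sigma[i]] for i in range(q))
        for chosen in permutations(S, m + 1):  # m+1 distinct sequences, order taken into account
            *examples, x = chosen
            data.append((sigma, [(e, apply_sigma(e)) for e in examples], (x, apply_sigma(x))))
    return data

data = enumerate_all_examples(2, 3, 5, list(permutations(range(3))))
print(len(data))  # 120960, i.e. dataset_size(2, 3, 5, order_matter=True)
```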
Note (during training): in the online approach, the effective size of the dataset is no longer known explicitly. But the number of training epochs $n_e$ and the number of training steps $n_s$ remain linked by $n_e = \lfloor \frac{n_s}{L} \rfloor + \mathbb{1}(n_s \% L \ne 0) = \lfloor \frac{n_s+L-1}{L} \rfloor$, with $L = \lfloor \frac{n}{b} \rfloor + \mathbb{1}(n \% b \ne 0) = \lfloor \frac{n+b-1}{b} \rfloor$ the number of batches and $b$ the batch size. One can fix $n_s$ and deduce $n_e$, or vice versa.
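As a quick sanity check of these formulas (plain ceiling divisions; the numbers are just an example):
```python
def num_batches(n, b):
    # L = ceil(n / b) = floor((n + b - 1) / b)
    return (n + b - 1) // b

def num_epochs(n_s, L):
    # n_e = ceil(n_s / L) = floor((n_s + L - 1) / L)
    return (n_s + L - 1) // L

n, b, n_s = 168, 32, 1000     # dataset size, batch size, number of training steps
L = num_batches(n, b)         # 6 batches per epoch
print(L, num_epochs(n_s, L))  # 6 167
```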
```python
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from math import factorial
from itertools import permutations, product
import random

class PermutationDataset(Dataset):
    def __init__(self, p, q, m, permutations_set, order_matter=True):
        self.p = p
        self.q = q
        self.m = m
        self.permutations_set = permutations_set
        self.order_matter = order_matter
        self.sequences = list(product(range(p), repeat=q))  # all of S = [p]^q
        # Number of unique examples, computed from |F_train/val|, |S| and m
        self.total_examples = self._calculate_total_examples()

    def _calculate_total_examples(self):
        size_F = len(self.permutations_set)
        size_S = len(self.sequences)
        # Ordered count: |F| * |S|! / (|S| - m - 1)!
        n = size_F * (factorial(size_S) // factorial(size_S - (self.m + 1)))
        # If the order of the chosen sequences does not matter, divide by (m+1)!
        return n if self.order_matter else n // factorial(self.m + 1)

    def __len__(self):
        return self.total_examples

    def __getitem__(self, idx):
        # Generate the example on the fly: `idx` is ignored, since mapping an index
        # to a specific (permutation, sequences) configuration without storing all
        # of them is more complex than sampling uniformly at random.
        perm = random.choice(self.permutations_set)
        seq_samples = random.sample(self.sequences, self.m + 1)
        x = seq_samples[:-1]  # the m example sequences x_1, ..., x_m
        y = [tuple(seq[perm[i]] for i in range(self.q)) for seq in x]  # y_i = sigma(x_i)
        predict_seq = seq_samples[-1]  # the query sequence x
        y_predict_seq = tuple(predict_seq[perm[i]] for i in range(self.q))  # expected y = sigma(x)
        # Convert to tensors (one example; batching is handled by the DataLoader)
        perm = torch.tensor(perm, dtype=torch.long)
        x = torch.tensor(x, dtype=torch.long)
        y = torch.tensor(y, dtype=torch.long)
        predict_seq = torch.tensor(predict_seq, dtype=torch.long)
        y_predict_seq = torch.tensor(y_predict_seq, dtype=torch.long)
        return perm, x, y, predict_seq, y_predict_seq
```
```python
p, q, m = 2, 4, 10
r = 0.8

n = dataset_size(p, q, m, order_matter=True)
print(n)

# Random partition of F into F_train and F_val
all_perms = list(permutations(range(q)))
np.random.shuffle(all_perms)
train_size = int(len(all_perms) * r)
train_perms = all_perms[:train_size]
val_perms = all_perms[train_size:]

train_dataset = PermutationDataset(p, q, m, train_perms)
val_dataset = PermutationDataset(p, q, m, val_perms)
n_train = len(train_dataset)
n_val = len(val_dataset)
print(n_train, n_val, n_train + n_val)

# perm_tensor, x_tensor, y_tensor, predict_seq_tensor, y_predict_seq_tensor = train_dataset[0]
# print(perm_tensor)
# print(x_tensor)
# print(y_tensor)
# print(predict_seq_tensor)
# print(y_predict_seq_tensor)

train_batch_size, val_batch_size = 2, 1
# shuffle=False: __getitem__ ignores the index and samples on the fly, and shuffle=True
# would try to build a random permutation of a huge index range (memory blow-up).
train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=False)
# val_dataloader = DataLoader(val_dataset, batch_size=val_batch_size, shuffle=False)

for perm_tensor, x_tensor, y_tensor, predict_seq_tensor, y_predict_seq_tensor in train_dataloader:
    print(perm_tensor)
    print(x_tensor)
    print(y_tensor)
    print(predict_seq_tensor)
    print(y_predict_seq_tensor)
    break
```
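For reference, the shapes of the tensors in one batch (assuming the default collate function, which stacks the per-example tensors along a new batch dimension); this is just a sanity check, not part of the original notebook:
```python
# b = train_batch_size
assert perm_tensor.shape          == (train_batch_size, q)     # sigma
assert x_tensor.shape             == (train_batch_size, m, q)  # x_1, ..., x_m
assert y_tensor.shape             == (train_batch_size, m, q)  # y_i = sigma(x_i)
assert predict_seq_tensor.shape   == (train_batch_size, q)     # query x
assert y_predict_seq_tensor.shape == (train_batch_size, q)     # expected y = sigma(x)
```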