Source code for mistral_v0_2.lib.example_data

from typing import Any

from datasets import load_dataset
from torch.utils.data import Dataset

[docs] def load_data(splitting: str) -> tuple[list[str], list[int]]: """ Loads the example data, the Yelp review dataset, and returns the text inputs with corresponding labels for either the training or test split. Args: splitting (str): A string indicating which split of the dataset to load. Expected values are 'train' for the training set and any other value will default to loading the test set. Returns: tuple[list[str], list[int]]: A tuple containing two lists: - The first list contains the text of the reviews. - The second list contains the corresponding labels as integers. """ dataset = load_dataset("yelp_review_full") if splitting == 'train': train_input = dataset['train']['text'] train_labels = dataset['train']['label'] return train_input, train_labels test_input = dataset['test']['text'] test_labels = dataset['test']['label'] return test_input, test_labels
class ExampleDataset(Dataset): def __init__(self, splitting: str) -> None: inputs, labels = load_data(splitting) self.data = list(zip(inputs, labels)) super().__init__() def __getitem__(self, index: int) -> Any: return self.data[index] def __len__(self) -> int: return len(self.data)