Datasets

Reference information for the language data Datasets API.

`eva.language.data.datasets.PubMedQA`

Bases: TextClassification

Dataset class for the PubMedQA question-answering task.

Parameters:

Name	Type	Description	Default
`root`	`str \| None`	Directory to cache the dataset. If None, no local caching is used.	`None`
`split`	`Literal['train', 'val', 'test'] \| None`	Valid splits among ["train", "val", "test"]. If None, it will use "train+test+validation".	`None`
`download`	`bool`	Whether to download the dataset if not found locally. Default is False.	`False`
`max_samples`	`int \| None`	Maximum number of samples to use. If None, use all samples.	`None`

Source code in src/eva/language/data/datasets/classification/pubmedqa.py

def __init__(
    self,
    root: str | None = None,
    split: Literal["train", "val", "test"] | None = None,
    download: bool = False,
    max_samples: int | None = None,
) -> None:
    """Store the configuration for a PubMedQA dataset instance.

    The arguments are only recorded on the instance here.

    Args:
        root: Directory to cache the dataset. If None, no local caching is used.
        split: One of "train", "val" or "test". If None, it will use
            "train+test+validation".
        download: Whether to download the dataset if not found locally.
            Default is False.
        max_samples: Maximum number of samples to use. If None, use all samples.
    """
    super().__init__()

    # Where the data lives and whether it may be fetched.
    self._root, self._download = root, download
    # Which portion of the data to expose, and how much of it.
    self._split, self._max_samples = split, max_samples

`prepare_data`

Downloads and prepares the PubMedQA dataset.

If `self._root` is None, the dataset is used directly from HuggingFace. Otherwise, it checks whether the dataset is already cached in `self._root`; if it is not, the dataset is downloaded into `self._root`.

Source code in src/eva/language/data/datasets/classification/pubmedqa.py

@override
def prepare_data(self) -> None:
    """Downloads and prepares the PubMedQA dataset.

    If `self._root` is None (or empty), no cache directory is used and
    `None` is passed to `self._load_dataset`. Otherwise `self._root` is
    created if missing and handed to `self._load_dataset` as the dataset
    path; the cache lookup / download itself is delegated to that helper.

    When `self._max_samples` is set and the loaded dataset is larger, a
    random subset of exactly `self._max_samples` samples is kept. The subset
    is drawn with a fixed seed (42), so it is the same on every run.

    Raises:
        OSError: If the cache directory cannot be created (raised by
            `os.makedirs`, outside the wrapped section).
        RuntimeError: If loading or subsampling the dataset fails; the
            original exception is chained as the cause.
    """
    dataset_path = None

    # Truthiness check: an empty string is treated like None (no caching).
    if self._root:
        dataset_path = self._root
        os.makedirs(self._root, exist_ok=True)

    try:
        self.dataset = self._load_dataset(dataset_path)
        if self._max_samples is not None and len(self.dataset) > self._max_samples:
            logger.info(
                f"Subsampling dataset from {len(self.dataset)} to {self._max_samples} samples"
            )
            # Use a private generator rather than `random.seed(42)`: it draws
            # the same indices, but does not reseed the process-wide RNG that
            # unrelated code may be relying on.
            rng = random.Random(42)
            indices = rng.sample(range(len(self.dataset)), self._max_samples)
            self.dataset = self.dataset.select(indices)
    except Exception as e:
        raise RuntimeError(f"Failed to prepare dataset: {e}") from e

`eva.language.data.datasets.LanguageDataset`

Bases: MapDataset, ABC, Generic[DataSample]

Base dataset class for text tasks.