Skip to content

Datasets

Reference information for the language data Datasets API.

eva.language.data.datasets.PubMedQA

Bases: TextClassification

Dataset class for PubMedQA question answering task.

Parameters:

Name Type Description Default
root str | None

Directory to cache the dataset. If None, no local caching is used.

None
split Literal['train', 'val', 'test'] | None

The dataset split to use: one of "train", "val" or "test". If None, it will use "train+test+validation".

None
download bool

Whether to download the dataset if not found locally. Default is False.

False
max_samples int | None

Maximum number of samples to use. If None, use all samples.

None
Source code in src/eva/language/data/datasets/classification/pubmedqa.py
def __init__(
    self,
    root: str | None = None,
    split: Literal["train", "val", "test"] | None = None,
    download: bool = False,
    max_samples: int | None = None,
) -> None:
    """Set up the configuration of a PubMedQA dataset instance.

    After delegating to the parent initializer, this constructor only stores
    the given arguments on the instance; they are consumed later by other
    methods of the class.

    Args:
        root: Directory used to cache the dataset. If None, no local caching
            is used.
        split: The split to use, one of "train", "val" or "test".
            If None, it will use "train+test+validation".
        download: Whether to download the dataset if it is not found locally.
            Default is False.
        max_samples: Upper bound on the number of samples to use. If None,
            all samples are used.
    """
    super().__init__()

    # Where the data lives and which part of it is requested.
    self._root, self._split = root, split
    # How the data may be obtained and how much of it is kept.
    self._download, self._max_samples = download, max_samples

prepare_data

Downloads and prepares the PubMedQA dataset.

If self._root is None (or empty), the dataset is used directly from HuggingFace. Otherwise, it checks if the dataset is already cached in self._root. If not cached, it downloads the dataset into self._root.

Source code in src/eva/language/data/datasets/classification/pubmedqa.py
@override
def prepare_data(self) -> None:
    """Loads the PubMedQA dataset and optionally subsamples it.

    If `self._root` is set, the directory is created when missing and passed
    to `self._load_dataset` as the dataset path; otherwise `None` is passed.
    Cache lookup and downloading are delegated to `self._load_dataset`.

    When `self._max_samples` is set and smaller than the dataset size, a
    reproducible random subset of that many samples is kept.

    Raises:
        RuntimeError: If loading or subsampling the dataset fails. The
            original exception is chained as the cause.
    """
    dataset_path = None

    if self._root:
        dataset_path = self._root
        os.makedirs(self._root, exist_ok=True)

    try:
        self.dataset = self._load_dataset(dataset_path)
        total = len(self.dataset)
        if self._max_samples is not None and total > self._max_samples:
            logger.info(f"Subsampling dataset from {total} to {self._max_samples} samples")
            # Use a private generator with a fixed seed instead of
            # `random.seed(42)`: the selected indices are the same, but the
            # process-wide RNG state used by the rest of the program is not
            # reset as a side effect of preparing this dataset.
            rng = random.Random(42)
            indices = rng.sample(range(total), self._max_samples)
            self.dataset = self.dataset.select(indices)
    except Exception as e:
        raise RuntimeError(f"Failed to prepare dataset: {e}") from e

eva.language.data.datasets.LanguageDataset

Bases: MapDataset, ABC, Generic[DataSample]

Base dataset class for text tasks.