# `dataset` must be of type `textattack.datasets.Dataset`; otherwise we cannot run the attack.
# TODO: add a factory for building datasets from other sources (files, APIs, etc.).
# TODO: add real dataset-handling methods - validation and preprocessing of data before an attack,
# train/test splitting, batch retrieval, etc.
# TODO: expose dataset statistics - size, class distribution, etc.
import pandas as pd
from pandas import read_csv
import json
from textattack.datasets import HuggingFaceDataset, Dataset
# TODO: add module-level documentation here.
class DatasetFactory:
    """
    Factory for constructing TextAttack datasets from user-supplied
    sources: local files (csv, xlsx, json, jsonl, parquet) or a
    HuggingFace dataset reference.
    """

    @staticmethod
    def to_ta_dataset(dataset, text_column='text', label_column='label'):
        """
        Convert a user dataset description into a TextAttack dataset.

        args:
            dataset: one of
                str -- path to a local file; the loader is picked from the
                    file extension (csv, xlsx, json, jsonl, parquet)
                (HFdataset, split) -- tuple of two, HFdataset -- str name of a
                    HuggingFace dataset, split -- train/val/test split of HFdataset
                (path, text_column, label_column) -- legacy 3-element sequence
                    carrying a file path plus the column names
            text_column: str, column with model input texts (file sources only)
            label_column: str, column with labels (file sources only)
        raises:
            ValueError: if the file extension is not supported.
        """
        # A 2-tuple always means a HuggingFace (name, split) reference.
        if isinstance(dataset, tuple) and len(dataset) == 2:
            return DatasetFactory._hf_to_ta(dataset)

        if isinstance(dataset, (list, tuple)) and len(dataset) == 3:
            # Legacy calling convention: (path, text_column, label_column).
            file_name, text_column, label_column = dataset
        else:
            # Bug fix: the original code indexed into the argument
            # (dataset[0] / dataset[1] / dataset[2]), which broke the
            # documented plain-string-path case ('data.csv'[0] == 'd').
            # A string path is now used directly, with the column names
            # coming from the keyword arguments.
            file_name = dataset

        # Dispatch table keyed by (lower-cased) file extension.
        loaders = {
            'csv': DatasetFactory._csv_to_ta,
            'xlsx': DatasetFactory._excel_to_ta,
            'json': DatasetFactory._json_to_ta,
            'jsonl': DatasetFactory._jsonl_to_ta,
            'parquet': DatasetFactory._parquet_to_ta,
        }
        extension = file_name.rsplit('.', 1)[-1].lower()
        if extension not in loaders:
            raise ValueError('The file was not found or the format is not supported. Available formats: \n -csv \n -xlsx \n -json \n -jsonl \n -parquet')
        return loaders[extension](file_name, text_column, label_column)

    @staticmethod
    def _df_to_ta(df, text_column, label_column):
        """
        Shared helper: turn a pandas DataFrame into a TextAttack Dataset
        of (text, label) pairs.
        """
        try:
            pairs = list(zip(df[text_column], df[label_column]))
        except KeyError:
            # Narrowed from a bare `except:` so unrelated errors propagate.
            raise ValueError(f'Expected "{text_column}" and "{label_column}" in columns')
        return Dataset(pairs)

    @staticmethod
    def _csv_to_ta(data, text_column='text', label_column='label'):
        """
        Convert a csv file to a TA Dataset (pandas DataFrame is the T in
        this ETL pipeline).

        args:
            data: str, path for data.
            text_column: str, text column for model input
            label_column: str, column with labels
        """
        df = pd.read_csv(data, encoding='utf-8')
        return DatasetFactory._df_to_ta(df, text_column, label_column)

    @staticmethod
    def _excel_to_ta(data, text_column='text', label_column='label'):
        """
        Convert an xlsx file to a TA Dataset (pandas DataFrame is the T in
        this ETL pipeline).

        args:
            data: str, path for data.
            text_column: str, text column for model input
            label_column: str, column with labels
        """
        # Bug fix: pd.read_excel() does not accept an `encoding` kwarg
        # (the original call raised TypeError on modern pandas).
        df = pd.read_excel(data)
        return DatasetFactory._df_to_ta(df, text_column, label_column)

    @staticmethod
    def _hf_to_ta(data, text_column='text', label_column='label'):
        """
        Convert a (HFdataset, split) reference to a TA Dataset.

        args:
            data: (HFdataset, split) -- tuple of two, HFdataset -- str name of a
                HuggingFace dataset, split -- train/val/test split of HFdataset.
        Note: text_column/label_column are ignored here; HuggingFaceDataset
        resolves its own columns.
        """
        return HuggingFaceDataset(data[0], None, data[1], shuffle=True)

    @staticmethod
    def _json_to_ta(data, text_column='text', label_column='label'):
        """
        Convert a json file (array of objects) to a TA Dataset.

        args:
            data: str, path for data.
            text_column: str, text column for model input
            label_column: str, column with labels
        """
        # Explicit utf-8 for consistency with the other text loaders.
        with open(data, 'r', encoding='utf-8') as file:
            records = json.load(file)
        try:
            pairs = [(item[text_column], item[label_column]) for item in records]
        except (KeyError, TypeError):
            raise ValueError(f'Expected "{text_column}" and "{label_column}" in columns')
        return Dataset(pairs)

    @staticmethod
    def _jsonl_to_ta(data, text_column='text', label_column='label'):
        """
        Convert a jsonl file (one json object per line) to a TA Dataset.

        args:
            data: str, path for data.
            text_column: str, text column for model input
            label_column: str, column with labels
        """
        with open(data, 'r', encoding='utf-8') as file:
            # Skip blank lines (the original crashed on a trailing newline).
            records = [json.loads(line) for line in file if line.strip()]
        try:
            pairs = [(item[text_column], item[label_column]) for item in records]
        except (KeyError, TypeError):
            raise ValueError(f'Expected "{text_column}" and "{label_column}" in columns')
        return Dataset(pairs)

    @staticmethod
    def _parquet_to_ta(data, text_column='text', label_column='label'):
        """
        Convert a parquet file to a TA Dataset (pandas DataFrame is the T
        in this ETL pipeline).

        args:
            data: str, path for data.
            text_column: str, text column for model input
            label_column: str, column with labels
        """
        df = pd.read_parquet(data, engine='pyarrow')
        return DatasetFactory._df_to_ta(df, text_column, label_column)