Source code for abs_text_attack.core.interfaces.dataset

# `dataset` must be of type `textattack.datasets.Dataset`; otherwise we cannot run attacks.
# It would be good to add a factory for building datasets from different sources (files, APIs, etc.).
# It is also worth thinking about real methods for working with a dataset: validation and preprocessing
# of the data before an attack, train/test splitting, batching, and so on.
# Dataset statistics (size, class distribution, etc.) would also be useful -- see the sketch after the class below.
import pandas as pd
from pandas import read_csv
import json
from textattack.datasets import HuggingFaceDataset, Dataset

class DatasetFactory:
    """Factory class for constructing TextAttack datasets from different sources."""

    @staticmethod
    def to_ta_dataset(dataset, text_column='text', label_column='label'):
        """Convert a user-provided dataset into a TextAttack dataset.

        args:
            dataset: either a path to a local file (csv, xlsx, json, jsonl or parquet),
                or a tuple (hf_dataset, split), where hf_dataset is the name of a
                Hugging Face dataset and split is its train/val/test split.
            text_column: str, name of the column with model input texts.
            label_column: str, name of the column with labels.
        """
        if isinstance(dataset, tuple) and len(dataset) == 2:
            return DatasetFactory._hf_to_ta(dataset)

        file_name = dataset
        file_extension = file_name.split('.')[-1].lower()
        if file_extension == 'csv':
            return DatasetFactory._csv_to_ta(file_name, text_column, label_column)
        elif file_extension == 'xlsx':
            return DatasetFactory._excel_to_ta(file_name, text_column, label_column)
        elif file_extension == 'json':
            return DatasetFactory._json_to_ta(file_name, text_column, label_column)
        elif file_extension == 'jsonl':
            return DatasetFactory._jsonl_to_ta(file_name, text_column, label_column)
        elif file_extension == 'parquet':
            return DatasetFactory._parquet_to_ta(file_name, text_column, label_column)
        else:
            raise ValueError(
                'Unsupported file format. Available formats: csv, xlsx, json, jsonl, parquet'
            )

    @staticmethod
    def _csv_to_ta(data, text_column='text', label_column='label'):
        """Convert a csv file to a TextAttack Dataset, using a pandas DataFrame as the
        intermediate representation.

        args:
            data: str, path to the file.
            text_column: str, column with model input texts.
            label_column: str, column with labels.
        """
        df = read_csv(data, encoding='utf-8')
        try:
            dataset_list = [(row[text_column], row[label_column]) for _, row in df.iterrows()]
        except KeyError:
            raise ValueError(f'Expected "{text_column}" and "{label_column}" in columns')
        return Dataset(dataset_list)

    @staticmethod
    def _excel_to_ta(data, text_column='text', label_column='label'):
        """Convert an xlsx file to a TextAttack Dataset, using a pandas DataFrame as the
        intermediate representation.

        args:
            data: str, path to the file.
            text_column: str, column with model input texts.
            label_column: str, column with labels.
        """
        df = pd.read_excel(data)  # read_excel does not accept an `encoding` argument in modern pandas
        try:
            dataset_list = [(row[text_column], row[label_column]) for _, row in df.iterrows()]
        except KeyError:
            raise ValueError(f'Expected "{text_column}" and "{label_column}" in columns')
        return Dataset(dataset_list)

    @staticmethod
    def _hf_to_ta(data, text_column='text', label_column='label'):
        """Convert a (hf_dataset, split) tuple to a TextAttack Dataset.

        args:
            data: tuple of two, hf_dataset -- name of a Hugging Face dataset,
                split -- train/val/test split of that dataset.
        """
        return HuggingFaceDataset(data[0], None, data[1], shuffle=True)

    @staticmethod
    def _json_to_ta(data, text_column='text', label_column='label'):
        """Convert a json file to a TextAttack Dataset.

        args:
            data: str, path to the file.
            text_column: str, key with model input texts.
            label_column: str, key with labels.
        """
        with open(data, 'r', encoding='utf-8') as file:
            records = json.load(file)
        try:
            tuple_array = [(item[text_column], item[label_column]) for item in records]
        except KeyError:
            raise ValueError(f'Expected "{text_column}" and "{label_column}" in columns')
        return Dataset(tuple_array)

    @staticmethod
    def _jsonl_to_ta(data, text_column='text', label_column='label'):
        """Convert a jsonl file to a TextAttack Dataset.

        args:
            data: str, path to the file.
            text_column: str, key with model input texts.
            label_column: str, key with labels.
        """
        records = []
        with open(data, 'r', encoding='utf-8') as file:
            for line in file:
                records.append(json.loads(line))
        try:
            tuple_array = [(item[text_column], item[label_column]) for item in records]
        except KeyError:
            raise ValueError(f'Expected "{text_column}" and "{label_column}" in columns')
        return Dataset(tuple_array)

    @staticmethod
    def _parquet_to_ta(data, text_column='text', label_column='label'):
        """Convert a parquet file to a TextAttack Dataset, using a pandas DataFrame as the
        intermediate representation.

        args:
            data: str, path to the file.
            text_column: str, column with model input texts.
            label_column: str, column with labels.
        """
        df = pd.read_parquet(data, engine='pyarrow')
        try:
            dataset_list = [(row[text_column], row[label_column]) for _, row in df.iterrows()]
        except KeyError:
            raise ValueError(f'Expected "{text_column}" and "{label_column}" in columns')
        return Dataset(dataset_list)
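

# A minimal usage sketch, also illustrating the "dataset statistics" idea from the TODO
# comments above. The `label_distribution` helper and the "rotten_tomatoes" dataset are
# assumptions made only for illustration; they are not part of this module's API.
from collections import Counter


def label_distribution(ta_dataset):
    """Count labels in a TextAttack dataset; items are (text, label) pairs, so index [1] is the label."""
    return Counter(ta_dataset[i][1] for i in range(len(ta_dataset)))


if __name__ == "__main__":
    # Load the public "rotten_tomatoes" dataset from the Hugging Face Hub (test split)
    # and print its size and class distribution.
    hf_dataset = DatasetFactory.to_ta_dataset(("rotten_tomatoes", "test"))
    print(f"size: {len(hf_dataset)}")
    print(f"class distribution: {label_distribution(hf_dataset)}")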