# `dataset` must be of type `textattack.datasets.Dataset`; otherwise we cannot run the attack.
# TODO: add a factory for building datasets from other sources (files, APIs, etc.).
# TODO: add real dataset-handling methods - validation and preprocessing of data before an attack,
# train/test splitting, batch retrieval, etc.
# TODO: expose dataset statistics - size, class distribution, etc.
import pandas as pd
from pandas import read_csv
import json
from textattack.datasets import HuggingFaceDataset, Dataset
# TODO: add module-level documentation here.
class DatasetFactory:
    """
    Factory for constructing TextAttack datasets from user-supplied
    sources: local files (csv, xlsx, json, jsonl, parquet) or a
    HuggingFace dataset reference.
    """

    @staticmethod
    def to_ta_dataset(dataset, text_column='text', label_column='label'):
        """
        Convert a user dataset description into a TextAttack dataset.

        args:
            dataset: one of
                str -- path to a local file; the loader is picked from the
                    file extension (csv, xlsx, json, jsonl, parquet)
                (HFdataset, split) -- tuple of two, HFdataset -- str name of a
                    HuggingFace dataset, split -- train/val/test split of HFdataset
                (path, text_column, label_column) -- legacy 3-element sequence
                    carrying a file path plus the column names
            text_column: str, column with model input texts (file sources only)
            label_column: str, column with labels (file sources only)
        raises:
            ValueError: if the file extension is not supported.
        """
        # A 2-tuple always means a HuggingFace (name, split) reference.
        if isinstance(dataset, tuple) and len(dataset) == 2:
            return DatasetFactory._hf_to_ta(dataset)

        if isinstance(dataset, (list, tuple)) and len(dataset) == 3:
            # Legacy calling convention: (path, text_column, label_column).
            file_name, text_column, label_column = dataset
        else:
            # Bug fix: the original code indexed into the argument
            # (dataset[0] / dataset[1] / dataset[2]), which broke the
            # documented plain-string-path case ('data.csv'[0] == 'd').
            # A string path is now used directly, with the column names
            # coming from the keyword arguments.
            file_name = dataset

        # Dispatch table keyed by (lower-cased) file extension.
        loaders = {
            'csv': DatasetFactory._csv_to_ta,
            'xlsx': DatasetFactory._excel_to_ta,
            'json': DatasetFactory._json_to_ta,
            'jsonl': DatasetFactory._jsonl_to_ta,
            'parquet': DatasetFactory._parquet_to_ta,
        }
        extension = file_name.rsplit('.', 1)[-1].lower()
        if extension not in loaders:
            raise ValueError('The file was not found or the format is not supported. Available formats: \n -csv \n -xlsx \n -json \n -jsonl \n -parquet')
        return loaders[extension](file_name, text_column, label_column)

    @staticmethod
    def _df_to_ta(df, text_column, label_column):
        """
        Shared helper: turn a pandas DataFrame into a TextAttack Dataset
        of (text, label) pairs.
        """
        try:
            pairs = list(zip(df[text_column], df[label_column]))
        except KeyError:
            # Narrowed from a bare `except:` so unrelated errors propagate.
            raise ValueError(f'Expected "{text_column}" and "{label_column}" in columns')
        return Dataset(pairs)

    @staticmethod
    def _csv_to_ta(data, text_column='text', label_column='label'):
        """
        Convert a csv file to a TA Dataset (pandas DataFrame is the T in
        this ETL pipeline).

        args:
            data: str, path for data.
            text_column: str, text column for model input
            label_column: str, column with labels
        """
        df = pd.read_csv(data, encoding='utf-8')
        return DatasetFactory._df_to_ta(df, text_column, label_column)

    @staticmethod
    def _excel_to_ta(data, text_column='text', label_column='label'):
        """
        Convert an xlsx file to a TA Dataset (pandas DataFrame is the T in
        this ETL pipeline).

        args:
            data: str, path for data.
            text_column: str, text column for model input
            label_column: str, column with labels
        """
        # Bug fix: pd.read_excel() does not accept an `encoding` kwarg
        # (the original call raised TypeError on modern pandas).
        df = pd.read_excel(data)
        return DatasetFactory._df_to_ta(df, text_column, label_column)

    @staticmethod
    def _hf_to_ta(data, text_column='text', label_column='label'):
        """
        Convert a (HFdataset, split) reference to a TA Dataset.

        args:
            data: (HFdataset, split) -- tuple of two, HFdataset -- str name of a
                HuggingFace dataset, split -- train/val/test split of HFdataset.
        Note: text_column/label_column are ignored here; HuggingFaceDataset
        resolves its own columns.
        """
        return HuggingFaceDataset(data[0], None, data[1], shuffle=True)

    @staticmethod
    def _json_to_ta(data, text_column='text', label_column='label'):
        """
        Convert a json file (array of objects) to a TA Dataset.

        args:
            data: str, path for data.
            text_column: str, text column for model input
            label_column: str, column with labels
        """
        # Explicit utf-8 for consistency with the other text loaders.
        with open(data, 'r', encoding='utf-8') as file:
            records = json.load(file)
        try:
            pairs = [(item[text_column], item[label_column]) for item in records]
        except (KeyError, TypeError):
            raise ValueError(f'Expected "{text_column}" and "{label_column}" in columns')
        return Dataset(pairs)

    @staticmethod
    def _jsonl_to_ta(data, text_column='text', label_column='label'):
        """
        Convert a jsonl file (one json object per line) to a TA Dataset.

        args:
            data: str, path for data.
            text_column: str, text column for model input
            label_column: str, column with labels
        """
        with open(data, 'r', encoding='utf-8') as file:
            # Skip blank lines (the original crashed on a trailing newline).
            records = [json.loads(line) for line in file if line.strip()]
        try:
            pairs = [(item[text_column], item[label_column]) for item in records]
        except (KeyError, TypeError):
            raise ValueError(f'Expected "{text_column}" and "{label_column}" in columns')
        return Dataset(pairs)

    @staticmethod
    def _parquet_to_ta(data, text_column='text', label_column='label'):
        """
        Convert a parquet file to a TA Dataset (pandas DataFrame is the T
        in this ETL pipeline).

        args:
            data: str, path for data.
            text_column: str, text column for model input
            label_column: str, column with labels
        """
        df = pd.read_parquet(data, engine='pyarrow')
        return DatasetFactory._df_to_ta(df, text_column, label_column)