# NOTE: leading web-viewer page chrome ("Spaces:", "Running", file size,
# commit hash, line-number gutter) removed so this file parses as Python.
import pandas as pd
from utils import loadjson, get_embedding, savepkl, loadpkl
import yaml
def generate_unified_qa_dataset(output_path='data/unified_qa_data.csv', sample_size=60):
    """
    Generate a unified question-answering dataset from multiple data sources.

    Parameters:
        output_path (str): Path to save the output CSV file.
        sample_size (int): Number of samples to extract from each dataset.

    Returns:
        pandas.DataFrame: The generated unified dataset with columns
        ['task_id', 'query', 'ground_truth', 'metric', 'task_description'].
    """
    # Unified output schema shared by every source dataset.
    columns = [
        'task_id', 'query', 'ground_truth', 'metric',
        'task_description'
    ]
    # Define dataset paths and corresponding task names with descriptions.
    dataset_configs = [
        {
            'task_name': 'alpaca_data',
            'path': 'data/alpaca_data/alpaca_data.json',
            'format': 'json',
            'query_fields': ['instruction', 'input'],
            'ground_truth_field': 'output',
            'metric': 'f1_score',
            'task_description': 'The Alpaca dataset is designed for instruction-following tasks, where the model is required to generate coherent and contextually appropriate responses to given instructions or prompts. It focuses on understanding diverse user requests and providing informative and accurate outputs based on those instructions.'
        },
        {
            'task_name': 'GSM8K',
            'path': 'data/GSM8K/GSM8K.json',
            'format': 'json',
            'query_fields': ['instruction', 'input'],
            'ground_truth_field': 'answer',
            'metric': 'GSM8K',
            'task_description': 'The GSM8K dataset is tailored for mathematical problem-solving tasks. It consists of natural language math problems that require the model to comprehend the problem statement, apply the correct mathematical operations, and provide the solution. The primary challenge lies in both parsing complex language and performing accurate calculations.'
        },
        {
            'task_name': 'multi_news',
            'path': 'data/multi_news/multi_news.json',
            'format': 'json',
            'query_fields': ['instruction', 'input'],
            'ground_truth_field': 'output',
            'metric': 'f1_score',
            'task_description': 'The Multi-News dataset is aimed at text summarization tasks. It contains multiple news articles on the same topic, and the model\'s objective is to generate a concise and comprehensive summary that integrates information from all the articles. The challenge is to distill key points while maintaining coherence and avoiding redundancy.'
        },
        {
            'task_name': 'SQUAD',
            'path': 'data/SQUAD/SQUAD.parquet',
            'format': 'parquet',
            'query_field': 'question',
            'ground_truth_field': 'answers',
            'ground_truth_subfield': 'text',
            'ground_truth_index': 0,
            'metric': 'f1_score',
            'task_description': 'The SQuAD dataset is focused on question-answering tasks, where the model is given a passage of text and needs to extract or generate a precise answer to a question based on the content of the passage. The dataset emphasizes comprehension, retrieval of relevant information, and concise answer generation.'
        }
    ]
    # Accumulate row dicts and build the DataFrame once at the end.
    # The original used df._append per row: a private pandas API (the public
    # DataFrame.append was removed in pandas 2.0) and O(n^2) in row count.
    rows = []
    for config in dataset_configs:
        if config['format'] == 'json':
            rows.extend(_rows_from_json(config, sample_size))
        elif config['format'] == 'parquet':
            rows.extend(_rows_from_parquet(config, sample_size))
    df = pd.DataFrame(rows, columns=columns)
    # Save results to CSV.
    df.to_csv(output_path, index=False)
    return df


def _make_row(config, query, ground_truth):
    """Build one unified-schema row dict from a source config and extracted fields."""
    return {
        'task_id': config['task_name'],
        'query': query,
        'ground_truth': ground_truth,
        'metric': config['metric'],
        'task_description': config['task_description'],
    }


def _rows_from_json(config, sample_size):
    """Return unified row dicts for the first `sample_size` items of a JSON dataset."""
    rows = []
    for item in loadjson(config['path'])[:sample_size]:
        fields = config['query_fields']
        if isinstance(fields, list):
            # Concatenate the configured fields (no separator, matching the
            # original behavior) to form the query text.
            query = ''.join(item[field] for field in fields)
        else:
            query = item[fields]
        rows.append(_make_row(config, query, item[config['ground_truth_field']]))
    return rows


def _rows_from_parquet(config, sample_size):
    """Return unified row dicts for the first `sample_size` rows of a Parquet dataset."""
    rows = []
    data = pd.read_parquet(config['path'])[:sample_size]
    for item in data.itertuples():
        query = getattr(item, config['query_field'])
        if 'ground_truth_subfield' in config:
            # Ground truth is nested (e.g. SQuAD answers -> text[0]).
            container = getattr(item, config['ground_truth_field'])
            ground_truth = container[config['ground_truth_subfield']][config['ground_truth_index']]
        else:
            ground_truth = getattr(item, config['ground_truth_field'])
        rows.append(_make_row(config, query, ground_truth))
    return rows
# Script entry point: read the project config and emit the unified dataset.
if __name__ == "__main__":
    # Pull the output path from the project YAML configuration.
    with open("configs/config.yaml", 'r', encoding='utf-8') as cfg_file:
        cfg = yaml.safe_load(cfg_file)
    # Default sample size; pass sample_size=... to override (e.g. 100).
    unified_dataset = generate_unified_qa_dataset(cfg['unified_qa_data_path'])