Spaces:
Sleeping
Sleeping
""" | |
Sep 2024 | |
Ryan Tietjen | |
Contains helper functions to process user input for the demo | |
""" | |
import pandas as pd | |
def split_abstract(abstract):
    """
    Break an abstract into one record per line, with the text lowercased.

    Args:
        abstract: String to split on newline characters.

    Returns:
        list: One dict per line holding a placeholder "target" of 0, the
        lowercased "text", the 1-based "line_number", and "total_lines"
        (the number of lines produced by the split).
    """
    pieces = abstract.split("\n")
    line_count = len(pieces)
    return [
        {
            "target": 0,
            "text": piece.lower(),
            "line_number": position,
            "total_lines": line_count,
        }
        for position, piece in enumerate(pieces, start=1)
    ]
def split_abstract_original(abstract):
    """
    Break an abstract into one record per line, keeping the text as written.

    Args:
        abstract: String to split on newline characters.

    Returns:
        list: One dict per line holding a placeholder "target" of 0, the
        unmodified "text", the 1-based "line_number", and "total_lines"
        (the number of lines produced by the split).
    """
    raw_lines = abstract.split("\n")
    total = len(raw_lines)
    records = []
    for number, content in enumerate(raw_lines, start=1):
        records.append(
            dict(target=0, text=content, line_number=number, total_lines=total)
        )
    return records
def split_sentences_by_characters(corpus):
    """
    Space-separate the elements of every sentence in a corpus.

    Args:
        corpus: Iterable of sentences; each sentence is an iterable of
            strings (for a plain string, its individual characters).

    Returns:
        list: One string per sentence, with its elements joined by a
        single space.
    """
    join_with_space = " ".join
    return list(map(join_with_space, corpus))
def encode_labels(*datasets):
    """
    Encode labels for multiple datasets using a unified label mapping.

    Every distinct label found across all datasets is assigned an integer
    equal to its position in ascending sorted order, and that single
    mapping is applied to each dataset.

    Args:
        *datasets: Arbitrary number of array-like structures containing labels.

    Returns:
        tuple: Encoded labels as numpy arrays, one per dataset, in the order
        the datasets were passed. An empty tuple when no datasets are given.
    """
    # pd.concat raises ValueError on an empty list, so a call without any
    # datasets is answered directly.
    if not datasets:
        return ()

    # Convert each dataset once and reuse the Series for both building the
    # mapping and encoding; this also keeps one-shot iterables intact.
    label_series = [pd.Series(data) for data in datasets]

    # Collect all labels from all datasets into a single Series
    all_labels = pd.concat(label_series)

    # Deduplicate and sort through the Series API. pd.unique() may return an
    # extension array (e.g. for categorical or string dtypes) that has no
    # in-place .sort(), whereas sort_values() works for any Series dtype.
    unique_labels = all_labels.drop_duplicates().sort_values()

    # Create mapping from labels to integers
    label_to_index = {label: idx for idx, label in enumerate(unique_labels)}

    # Encode all datasets using the shared mapping
    return tuple(
        series.map(label_to_index).to_numpy() for series in label_series
    )