Spaces:
Sleeping
Sleeping
"""from fastapi import FastAPI | |
from datasets import load_dataset | |
from transformers import AutoTokenizer | |
app = FastAPI() | |
@app.get("/") | |
def summarize(): | |
# Example: Loading a dataset as part of the API | |
billsum = load_dataset("billsum", split="ca_test") | |
# import pandas as pd | |
# df = pd.read_csv("squad_sample_train.tsv", sep="\t") | |
# print(df.head()) # Debugging step | |
# return {"Hello": "World!", "dataset_length": len(billsum)} | |
# return df.head() | |
checkpoint = "google-t5/t5-small" | |
tokenizer = AutoTokenizer.from_pretrained(checkpoint) | |
prefix = "summarize: " | |
def preprocess_function(examples): | |
inputs = [prefix + doc for doc in examples["text"]] | |
model_inputs = tokenizer(inputs, max_length=1024, truncation=True) | |
labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True) | |
model_inputs["labels"] = labels["input_ids"] | |
return model_inputs | |
tokenized_billsum = billsum.map(preprocess_function, batched=True) | |
return tokenized_billsum """ | |
from fastapi import FastAPI | |
from datasets import load_dataset | |
from transformers import AutoTokenizer | |
# FastAPI application object for this module.
app = FastAPI()
# NOTE(review): the route registration below is commented out, as is the one
# directly above `get_tokenized_data`, so the code visible here registers no
# route on `app` -- confirm whether that is intentional.
# @app.get("/")

# Load dataset and tokenizer once, at import time. They are module-level
# state shared by `preprocess_function` and `get_tokenized_data`.
billsum = load_dataset("billsum", split="ca_test")  # Load a small sample
tokenizer = AutoTokenizer.from_pretrained("t5-small")
prefix = "summarize: "  # Example prefix for text generation
def preprocess_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: Batch mapping that provides a ``"text"`` sequence (source
            documents) and a ``"summary"`` sequence (target summaries).

    Returns:
        The tokenizer output for the prefixed documents, with the summary
        token ids stored under the ``"labels"`` key.
    """
    # Every source document is prepended with the module-level task prefix.
    prompts = [prefix + document for document in examples["text"]]

    # Encode the sources first, then the targets; both are truncated to
    # their respective maximum lengths.
    encoded = tokenizer(prompts, max_length=1024, truncation=True)
    target_encoding = tokenizer(
        text_target=examples["summary"],
        max_length=128,
        truncation=True,
    )

    encoded["labels"] = target_encoding["input_ids"]
    return encoded
#@app.get("/") | |
def get_tokenized_data():
    """Tokenize the loaded dataset split and return it as a JSON-ready payload.

    Applies ``preprocess_function`` in batched mode to the module-level
    ``billsum`` dataset and returns every resulting row.

    Returns:
        dict: ``{"tokenized_data": rows}`` where ``rows`` is a list holding
        one dict per dataset row.
    """
    tokenized_billsum = billsum.map(preprocess_function, batched=True)

    # Materialise the rows by iterating the dataset directly. Going through
    # ``to_pandas().to_dict(orient="records")`` turns the list-valued token
    # columns into NumPy arrays, which are not JSON serializable; iterating
    # the dataset yields plain dicts whose token columns are Python lists.
    json_serializable_output = list(tokenized_billsum)

    return {"tokenized_data": json_serializable_output}