from fastapi import FastAPI
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
import evaluate

app = FastAPI()


@app.get("/")
def summarize():
    # Example: loading a dataset as part of the API
    billsum = load_dataset("billsum", split="ca_test")
    # import pandas as pd
    # df = pd.read_csv("squad_sample_train.tsv", sep="\t")
    # print(df.head())  # Debugging step
    # return {"Hello": "World!", "dataset_length": len(billsum)}
    # return df.head()

    # Tokenizer and task prefix for T5-style summarization
    checkpoint = "google-t5/t5-small"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    prefix = "summarize: "

    def preprocess_function(examples):
        # Prepend the task prefix and tokenize the source documents
        inputs = [prefix + doc for doc in examples["text"]]
        model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
        # Tokenize the reference summaries to use as labels
        labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_billsum = billsum.map(preprocess_function, batched=True)
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)
    rouge = evaluate.load("rouge")  # ROUGE metric for evaluating generated summaries
    # A DataCollatorForSeq2Seq instance is not JSON-serializable,
    # so return a JSON-friendly summary of the preprocessing step instead
    return {"dataset_length": len(tokenized_billsum), "collator": type(data_collator).__name__}
    # return type(tokenized_billsum)
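
# Usage sketch: assuming this module is saved as app.py (the usual Space entry point)
# and served on the Spaces default port 7860, the app can be launched with
#   uvicorn app:app --host 0.0.0.0 --port 7860
# and the endpoint above exercised with
#   curl http://localhost:7860/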
"""from fastapi import FastAPI
from datasets import load_dataset
from transformers import AutoTokenizer
app = FastAPI()
#@app.get("/")
# Load dataset and tokenizer
billsum = load_dataset("billsum", split="ca_test") # Load a small sample
tokenizer = AutoTokenizer.from_pretrained("t5-small")
prefix = "summarize: " # Example prefix for text generation
@app.get("/")
def preprocess_function(examples):
inputs = [prefix + doc for doc in examples["text"]]
model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
model_inputs["labels"] = labels["input_ids"]
return model_inputs
#@app.get("/")
def get_tokenized_data():
tokenized_billsum = billsum.map(preprocess_function, batched=True)
# Convert to list of dictionaries
json_serializable_output = tokenized_billsum.to_pandas().to_dict(orient="records")
return {"tokenized_data": json_serializable_output} # Ensure JSON format"""