"""from fastapi import FastAPI
from datasets import load_dataset
from transformers import AutoTokenizer

app = FastAPI()

@app.get("/")
def summarize():
    # Example: Loading a dataset as part of the API
    billsum = load_dataset("billsum", split="ca_test")

#    import pandas as pd

#    df = pd.read_csv("squad_sample_train.tsv", sep="\t")
#    print(df.head())  # Debugging step
#    return {"Hello": "World!", "dataset_length": len(billsum)}
#    return df.head()
    checkpoint = "google-t5/t5-small"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    prefix = "summarize: "


    def preprocess_function(examples):
        inputs = [prefix + doc for doc in examples["text"]]
        model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

        labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_billsum = billsum.map(preprocess_function, batched=True)
    
    return tokenized_billsum """

from fastapi import FastAPI
from datasets import load_dataset
from transformers import AutoTokenizer

app = FastAPI()

# Load the dataset and tokenizer once at import time so each request
# doesn't repeat the work
billsum = load_dataset("billsum", split="ca_test")  # California test split
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
prefix = "summarize: "  # task prefix T5 expects for summarization

def preprocess_function(examples):
    # Prepend the task prefix so T5 treats each document as a summarization input
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs
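
# For example, a one-row batch (toy data, for illustration) yields:
#   preprocess_function({"text": ["An act to amend Section 1 ..."],
#                        "summary": ["Amends Section 1."]})
#   # -> {"input_ids": [[...]], "attention_mask": [[...]], "labels": [[...]]}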

@app.get("/")
def get_tokenized_data():
    tokenized_billsum = billsum.map(preprocess_function, batched=True)

    # to_pandas() leaves numpy arrays in each cell, which the default JSON
    # encoder rejects; to_list() returns plain Python lists instead
    json_serializable_output = tokenized_billsum.to_list()

    return {"tokenized_data": json_serializable_output}