"""from fastapi import FastAPI
from datasets import load_dataset
from transformers import AutoTokenizer

app = FastAPI()

@app.get("/")
def summarize():
    # Example: Loading a dataset as part of the API
    billsum = load_dataset("billsum", split="ca_test")

#    import pandas as pd

#    df = pd.read_csv("squad_sample_train.tsv", sep="\t")
#    print(df.head())  # Debugging step
#    return {"Hello": "World!", "dataset_length": len(billsum)}
#    return df.head()
    checkpoint = "google-t5/t5-small"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    prefix = "summarize: "


    def preprocess_function(examples):
        inputs = [prefix + doc for doc in examples["text"]]
        model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

        labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_billsum = billsum.map(preprocess_function, batched=True)
    
    return tokenized_billsum """

from fastapi import FastAPI
from datasets import load_dataset
from transformers import AutoTokenizer

app = FastAPI()

# Load the dataset and tokenizer once at import time so each request
# doesn't repeat the work
billsum = load_dataset("billsum", split="ca_test")  # California test split
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
prefix = "summarize: "  # task prefix T5 expects for summarization

def preprocess_function(examples):
    # Prepend the task prefix so T5 treats each document as a summarization input
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs
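
# For example, a one-row batch (toy data, for illustration) yields:
#   preprocess_function({"text": ["An act to amend Section 1 ..."],
#                        "summary": ["Amends Section 1."]})
#   # -> {"input_ids": [[...]], "attention_mask": [[...]], "labels": [[...]]}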

@app.get("/")
def get_tokenized_data():
    tokenized_billsum = billsum.map(preprocess_function, batched=True)

    # to_pandas() leaves numpy arrays in each cell, which the default JSON
    # encoder rejects; to_list() returns plain Python lists instead
    json_serializable_output = tokenized_billsum.to_list()

    return {"tokenized_data": json_serializable_output}