from fastapi import FastAPI
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq
import evaluate

app = FastAPI()

@app.get("/")
def summarize():
    """Prepare the billsum dataset for T5 summarization fine-tuning.

    Despite the route name, this endpoint only loads and tokenizes the data;
    it does not generate summaries.
    """
    billsum = load_dataset("billsum", split="ca_test")

    # Small T5 checkpoint; T5 expects a task prefix on every input.
    checkpoint = "google-t5/t5-small"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    prefix = "summarize: "

    def preprocess_function(examples):
        # Prefix and tokenize the documents; truncate to the model's input size.
        inputs = [prefix + doc for doc in examples["text"]]
        model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

        # Tokenize the reference summaries as targets via text_target.
        labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_billsum = billsum.map(preprocess_function, batched=True)
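    # The batched map above tokenizes the whole split, adding input_ids,
    # attention_mask, and labels columns alongside the original text fields.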

    # DataCollatorForSeq2Seq expects a model object (not the checkpoint string)
    # so it can prepare decoder_input_ids; it also pads each batch dynamically.
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

    rouge = evaluate.load("rouge")
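
    # Sketch (assumption): `rouge` is loaded but never used in this endpoint.
    # The usual next step, e.g. as `compute_metrics` for a Seq2SeqTrainer,
    # decodes predictions and labels and scores them with ROUGE. The helper
    # name and the (predictions, labels) input shape are conventions, not
    # part of the original code.
    def compute_metrics(eval_pred):
        import numpy as np

        predictions, labels = eval_pred
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        # Replace -100 (positions ignored by the loss) with the pad token id
        # before decoding.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
        return {k: round(float(v), 4) for k, v in result.items()}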
    
    # A DataCollatorForSeq2Seq instance is not JSON-serializable, so return a
    # small JSON summary of the prepared data instead.
    return {"dataset": "billsum/ca_test", "num_examples": len(tokenized_billsum)}

"""from fastapi import FastAPI
from datasets import load_dataset
from transformers import AutoTokenizer

app = FastAPI()

#@app.get("/")
# Load dataset and tokenizer
billsum = load_dataset("billsum", split="ca_test")  # Load a small sample
tokenizer = AutoTokenizer.from_pretrained("t5-small")
prefix = "summarize: "  # Example prefix for text generation

@app.get("/")
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

#@app.get("/")
def get_tokenized_data():
    tokenized_billsum = billsum.map(preprocess_function, batched=True)

    # Convert to list of dictionaries
    json_serializable_output = tokenized_billsum.to_pandas().to_dict(orient="records")

    return {"tokenized_data": json_serializable_output}  # Ensure JSON format"""