# Log in to Hugging Face (only needs to be done once)

In [None]:
from huggingface_hub import interpreter_login
# Authenticate with the Hugging Face Hub; later cells push a dataset and a model to it.
interpreter_login()

# Collect Menu Image Datasets
- Use `metadata.jsonl` to label the ground truth of each image. You can visit [here](https://github.com/ryanlinjui/menu-text-detection/tree/main/examples) to see the examples.
- After finishing, push to HuggingFace Datasets.
- For labeling:
    - [Google AI Studio](https://aistudio.google.com) or [OpenAI ChatGPT](https://chatgpt.com).
    - Use function calling via the API. Start the Gradio app locally or visit [here](https://huggingface.co/spaces/ryanlinjui/menu-text-detection).

### Menu Type
- **h**: horizontal menu
- **v**: vertical menu
- **d**: document-style menu
- **s**: in-scene menu (non-document style)
- **i**: irregular menu (menu with irregular text layout)

> Please see the [examples](https://github.com/ryanlinjui/menu-text-detection/tree/main/examples) for more details.

In [None]:
from datasets import load_dataset

dataset = load_dataset(path="datasets/menu-zh-TW")      # load the dataset from the local directory that contains metadata.jsonl and the image files
dataset.push_to_hub(repo_id="ryanlinjui/menu-zh-TW")    # push the loaded dataset to the Hugging Face dataset hub

# Setup for Fine-tuning

In [None]:
from datasets import load_dataset
from transformers import DonutProcessor, VisionEncoderDecoderModel, VisionEncoderDecoderConfig

from menu.donut import DonutDatasets

DATASETS_REPO_ID = "ryanlinjui/menu-zh-TW"              # set your dataset repo id for training
PRETRAINED_MODEL_REPO_ID = "naver-clova-ix/donut-base"  # set your pretrained model repo id for fine-tuning
TASK_PROMPT_NAME = "<s_menu-text-detection>"            # set your task prompt name for training
MAX_LENGTH = 1024                                       # set your max length for maximum output length, max to 1536 for donut-base
IMAGE_SIZE = [1280, 960]                                # set your image size for training

raw_datasets = load_dataset(DATASETS_REPO_ID)

# Config: start from the pretrained config and override the encoder input size
# and the decoder's maximum output length.
config = VisionEncoderDecoderConfig.from_pretrained(PRETRAINED_MODEL_REPO_ID)
config.encoder.image_size = IMAGE_SIZE
config.decoder.max_length = MAX_LENGTH

# Processor: use the processor to process the dataset.
# Convert the image to the tensor and the text to the token ids.
processor = DonutProcessor.from_pretrained(PRETRAINED_MODEL_REPO_ID)
# `feature_extractor` is the deprecated alias of `image_processor`; use the current attribute.
# The processor receives the size in reversed order relative to config.encoder.image_size.
processor.image_processor.size = IMAGE_SIZE[::-1]
processor.image_processor.do_align_long_axis = False

# DonutDatasets: use the DonutDatasets to process the dataset.
# For model input, the image must be converted to a tensor and the JSON text must be converted to tokens with the task prompt string.
# This example uses the column names "image" and "menu": the image file is in the "image" column and the JSON text is in the "menu" column.
datasets = DonutDatasets(
    datasets=raw_datasets,
    processor=processor,
    image_column="image",
    annotation_column="menu",
    task_start_token=TASK_PROMPT_NAME,
    prompt_end_token=TASK_PROMPT_NAME,
    max_length=MAX_LENGTH,
    train_split=0.8,
    validation_split=0.1,
    test_split=0.1,
    sort_json_key=False,
    seed=42,
    shuffle=False
)

# Model: load the pretrained model and set the config.
model = VisionEncoderDecoderModel.from_pretrained(PRETRAINED_MODEL_REPO_ID, config=config)
# Resize the decoder embeddings to the tokenizer's current vocabulary size.
model.decoder.resize_token_embeddings(len(processor.tokenizer))
model.config.pad_token_id = processor.tokenizer.pad_token_id
# Generation starts from the task prompt token.
model.config.decoder_start_token_id = processor.tokenizer.convert_tokens_to_ids([TASK_PROMPT_NAME])[0]

# Start Fine-tuning

In [None]:
from functools import reduce

import torch
import numpy as np
from nltk.metrics import edit_distance
from transformers.trainer_utils import EvalPrediction
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Hugging Face model repo id used for saving / pushing the fine-tuned model to the hub
HUGGINGFACE_MODEL_ID = "ryanlinjui/donut-base-finetuned-menu"

# Training hyperparameters
EPOCHS = 100
TRAIN_BATCH_SIZE = 1
LEARNING_RATE = 3e-5
WEIGHT_DECAY = 0.1

# Device preference order: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")
model.to(device)

validation_datasets = datasets["validation"]
train_datasets = datasets["train"]

# Tokenizer special tokens that are stripped from sequences before scoring.
filtered_tokens = [
    getattr(processor.tokenizer, token_attr)
    for token_attr in ("bos_token", "eos_token", "pad_token", "unk_token")
]
def compute_metrics(eval_pred: EvalPrediction) -> dict:
    """Compute the mean normalized edit distance over the validation set.

    Each prediction is decoded, stripped of the tokens in ``filtered_tokens``
    and compared with the ``target_sequence`` of the validation sample at the
    same index. The per-sample edit distance is divided by the length of the
    longer of the two sequences and printed along with both sequences.

    Args:
        eval_pred: Evaluation output whose ``predictions`` hold generated token ids.

    Returns:
        A dict with the single key ``"normed_edit_distance"``.
    """
    predictions = eval_pred.predictions
    if isinstance(predictions, tuple):
        # Some configurations return extra outputs; the token ids come first.
        predictions = predictions[0]
    # The trainer can pad gathered predictions with -100, which the tokenizer
    # cannot decode; map those positions to the pad token instead.
    predictions = np.asarray(predictions)
    predictions = np.where(predictions != -100, predictions, processor.tokenizer.pad_token_id)
    decoded_preds = processor.tokenizer.batch_decode(predictions, skip_special_tokens=False)

    def strip_filtered_tokens(text: str) -> str:
        # Remove every occurrence of each filtered special token.
        return reduce(lambda s, t: s.replace(t, ""), filtered_tokens, text)

    normed_eds = []
    for idx, pred in enumerate(decoded_preds):
        prediction_sequence = strip_filtered_tokens(pred)
        target_sequence = strip_filtered_tokens(validation_datasets[idx]["target_sequence"])
        longest = max(len(prediction_sequence), len(target_sequence))
        # Two empty sequences are identical: distance 0 rather than a division by zero.
        ed = edit_distance(prediction_sequence, target_sequence) / longest if longest else 0.0
        normed_eds.append(ed)

        print(f"[Sample {idx}]")
        print(f"  Prediction: {prediction_sequence}")
        print(f"  Target: {target_sequence}")
        print(f"  Normalized Edit Distance: {ed:.4f}")
        print("-" * 40)

    return {"normed_edit_distance": float(np.mean(normed_eds))}

# Training configuration: evaluate and checkpoint every 200 steps, log every 50,
# and generate during evaluation so compute_metrics receives predicted token ids.
training_args = Seq2SeqTrainingArguments(
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    per_device_eval_batch_size=1,
    output_dir="./.checkpoints",
    seed=42,
    warmup_steps=30,
    eval_strategy="steps",
    eval_steps=200,
    fp16=(device == "cuda"),  # mixed precision only when running on CUDA
    predict_with_generate=True,
    generation_max_length=MAX_LENGTH,
    generation_num_beams=1,
    logging_strategy="steps",
    logging_steps=50,
    save_strategy="steps",
    save_steps=200,
    push_to_hub=bool(HUGGINGFACE_MODEL_ID),  # push only when a model repo id is set
    hub_model_id=HUGGINGFACE_MODEL_ID,
    hub_strategy="every_save",
    report_to="tensorboard"
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_datasets,
    eval_dataset=validation_datasets,
    processing_class=processor,
    compute_metrics=compute_metrics
)

trainer.train()
# Keep the final push consistent with the `push_to_hub` training argument:
# skip it when no model repo id is configured.
if HUGGINGFACE_MODEL_ID:
    trainer.push_to_hub()

In [None]:
from PIL import Image
from transformers import pipeline
from transformers import DonutProcessor

# Inference example: run the fine-tuned model on a single menu image.
MODEL_REPO_ID = "ryanlinjui/donut-base-finetuned-menu"
TASK_PROMPT_NAME = "<s_menu-text-detection>"
# NOTE: MAX_LENGTH and IMAGE_SIZE are not referenced by the statements below in this cell.
MAX_LENGTH = 1024
IMAGE_SIZE = [1280, 960]

# Load the processor and build the pipeline from the same model repo.
processor = DonutProcessor.from_pretrained(MODEL_REPO_ID)
pipe = pipeline("image-text-to-text", model=MODEL_REPO_ID, processor=processor)
image = Image.open("./examples/menu-hd.jpg")

# The task prompt is passed as the text input; keep the generated text of the first result.
outputs = pipe(text=TASK_PROMPT_NAME, images=image)[0]["generated_text"]

# Print the raw generated sequence, then the same sequence converted by token2json.
print(outputs)
print(processor.token2json(outputs))

# Plot the results

In [None]:
# Training Loss
# Validation normalized edit distance (ED) per epoch, on a 1~0 scale: 1 -> 0.22
# Test accuracy: TED accuracy = 0.687058, F1 score accuracy = 0.51119