AC-Angelo93 committed on
Commit 018e46d · verified · 1 Parent(s): 1058a3f

Update app.py

Files changed (1)
  1. app.py +217 -100
app.py CHANGED
@@ -1,108 +1,225 @@
- import os
  import gradio as gr
  import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
-
- # If you have a HF token in the Space secrets, uncomment below:
- os.environ["HUGGINGFACE_HUB_TOKEN"] = os.getenv("HF_TOKEN", "")
-
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-
- # Load tokenizer + model with trust_remote_code, and let Transformers shard/auto‐offload if needed.
- tokenizer = AutoTokenizer.from_pretrained(
-     "Fastweb/FastwebMIIA-7B",
-     use_fast=True,
-     trust_remote_code=True
- )
-
- model = AutoModelForCausalLM.from_pretrained(
-     "Fastweb/FastwebMIIA-7B",
-     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-     device_map="auto",  # let HF accelerate/device_map place layers automatically
-     trust_remote_code=True
- )
-
- model.eval()  # set to eval mode
-
- def respond(
-     message: str,
-     history: list[tuple[str, str]],
-     system_message: str,
-     max_tokens: int,
-     temperature: float,
-     top_p: float,
- ):
-     """
-     Build a list of messages in the format the model expects, apply any chat template,
-     tokenize, generate, and decode. Wrap inference in torch.no_grad() to save memory.
-     """
-     # 1) Build the “chat” message list
-     messages = []
-     if system_message:
-         messages.append({"role": "system", "content": system_message})
-
-     for user_msg, bot_msg in history:
-         if user_msg:
-             messages.append({"role": "user", "content": user_msg})
-         if bot_msg:
-             messages.append({"role": "assistant", "content": bot_msg})
-
-     messages.append({"role": "user", "content": message})
-
-     # 2) Format via the model’s chat template
-     # Note: many community‐models define `apply_chat_template`.
-     input_text = tokenizer.apply_chat_template(
-         messages,
-         tokenize=False,
-         add_generation_prompt=True
-     )
-     inputs = tokenizer(input_text, return_tensors="pt")
-     input_ids = inputs.input_ids.to(DEVICE)
-     attention_mask = inputs.attention_mask.to(DEVICE)
-
-     # 3) Inference under no_grad
-     with torch.no_grad():
-         outputs = model.generate(
-             input_ids=input_ids,
-             attention_mask=attention_mask,
-             max_new_tokens=max_tokens,
-             temperature=temperature,
-             top_p=top_p,
              do_sample=True,
-             pad_token_id=tokenizer.eos_token_id,
          )

-     # 4) Skip the prompt tokens and decode only the newly generated tokens
-     generated_tokens = outputs[0][input_ids.shape[1]:]
-     response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
-     return response
-
-
- # Build a Gradio ChatInterface; sliders/textbox for system‐prompt and sampling‐params
- chat_interface = gr.ChatInterface(
-     fn=respond,
-     title="FastwebMIIA‐7B Chatbot",
-     description="A simple chat demo using Fastweb/FastwebMIIA‐7B",
-     # “additional_inputs” become available above the conversation window
-     additional_inputs=[
-         gr.Textbox(
-             value="You are a helpful assistant.",
-             label="System message (role: system)"
-         ),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.9,
-             step=0.05,
-             label="Top-p (nucleus sampling)"
-         ),
-     ],
-     # You can tweak CSS or theme here if you like; omitted for brevity.
- )

  if __name__ == "__main__":
-     # On HF Spaces, you often want `share=False` (default). If you need to expose a public URL, set True.
-     chat_interface.launch(server_name="0.0.0.0", server_port=7860)

  import gradio as gr
+ from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
+ import huggingface_hub
+ import os
  import torch
+
+ # --- Configuration ---
+ MODEL_ID = "Fastweb/FastwebMIIA-7B"
+ HF_TOKEN = os.getenv("HF_TOKEN")  # For Hugging Face Spaces, set this as a Secret
+
+ # Global variable to store the pipeline
+ text_generator_pipeline = None
+ model_load_error = None
+
+ # --- Hugging Face Login and Model Loading ---
+ def load_model_and_pipeline():
+     global text_generator_pipeline, model_load_error
+     if text_generator_pipeline is not None:
+         return True  # Already loaded
+
+     if not HF_TOKEN:
+         model_load_error = "Hugging Face token (HF_TOKEN) not found in Space secrets. Please add it."
+         print(f"ERROR: {model_load_error}")
+         return False
+
+     try:
+         print("Attempting to login to Hugging Face Hub with token...")
+         huggingface_hub.login(token=HF_TOKEN)
+         print("Login successful.")
+
+         print(f"Loading tokenizer for {MODEL_ID}...")
+         # trust_remote_code is necessary for some models that define custom architectures/code
+         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+         print("Tokenizer loaded.")
+
+         print(f"Loading model {MODEL_ID}...")
+         # For large models, specify dtype and device_map
+         # device_map="auto" will try to use GPU if available, otherwise CPU
+         # torch_dtype="auto" or torch.bfloat16 (if supported by hardware) can save memory
+         # On CPU Spaces (free tier), this will be VERY slow or might OOM.
+         # You might need to use quantization (e.g., bitsandbytes) for CPU, but that's more complex.
+         model = AutoModelForCausalLM.from_pretrained(
+             MODEL_ID,
+             trust_remote_code=True,
+             torch_dtype="auto",  # or torch.bfloat16 if on A10G or similar
+             device_map="auto"    # "auto" is good for single/multi GPU or CPU fallback
+         )
+         print("Model loaded.")
+
+         # MIIA is an instruct/chat model, so text-generation is the appropriate task
+         text_generator_pipeline = pipeline(
+             "text-generation",
+             model=model,
+             tokenizer=tokenizer,
+             # device=0 if torch.cuda.is_available() else -1  # device_map handles this
+         )
+         print("Text generation pipeline created successfully.")
+         model_load_error = None
+         return True
+     except Exception as e:
+         model_load_error = f"Error loading model/pipeline: {str(e)}. Check model name, token, and Space resources (RAM/GPU)."
+         print(f"ERROR: {model_load_error}")
+         text_generator_pipeline = None  # Ensure it's None on error
+         return False
+
+ # --- Text Analysis Function ---
+ def analyze_text(text_input, file_upload, custom_instruction, max_new_tokens, temperature, top_p):
+     global text_generator_pipeline, model_load_error
+
+     if text_generator_pipeline is None:
+         if model_load_error:
+             return f"Model not loaded. Error: {model_load_error}"
+         else:
+             return "Model is not loaded. Please ensure HF_TOKEN is set and the Space has enough resources."
+
+     content_to_analyze = ""
+     if file_upload is not None:
+         try:
+             # file_upload is a TemporaryFileWrapper object, .name gives the path
+             with open(file_upload.name, 'r', encoding='utf-8') as f:
+                 content_to_analyze = f.read()
+             if not content_to_analyze.strip() and not text_input.strip():  # if file is empty and no text input
+                 return "Uploaded file is empty and no direct text input provided. Please provide some text."
+             elif not content_to_analyze.strip() and text_input.strip():  # if file empty but text input has content
+                 content_to_analyze = text_input
+             # If file has content, it will be used. If user also typed, file content takes precedence.
+             # We could add logic to concatenate or choose, but this is simpler.
+
+         except Exception as e:
+             return f"Error reading uploaded file: {str(e)}"
+     elif text_input:
+         content_to_analyze = text_input
+     else:
+         return "Please provide text directly or upload a document."
+
+     if not content_to_analyze.strip():
+         return "Input text is empty."
+
+     # FastwebMIIA is an instruct model. It expects prompts like Alpaca.
+     # Structure:
+     # Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+     # ### Instruction:
+     # {your instruction}
+     # ### Input:
+     # {your text}
+     # ### Response:
+     # {model generates this}
+
+     prompt = f"""Di seguito è riportata un'istruzione che descrive un task, abbinata a un input che fornisce un contesto più ampio. Scrivi una risposta che completi la richiesta in modo appropriato.
+
+ ### Istruzione:
+ {custom_instruction}
+
+ ### Input:
+ {content_to_analyze}
+
+ ### Risposta:"""
+
+     # For English, you might change the preamble:
+     # prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+     # ### Instruction:
+     # {custom_instruction}
+     # ### Input:
+     # {content_to_analyze}
+     # ### Response:"""
+
+
+     print(f"\n--- Sending to Model ---")
+     print(f"Prompt:\n{prompt}")
+     print(f"Max New Tokens: {max_new_tokens}, Temperature: {temperature}, Top P: {top_p}")
+     print("------------------------\n")
+
+     try:
+         # Note: text-generation pipelines often return the prompt + completion.
+         # We might need to strip the prompt from the output if desired.
+         generated_outputs = text_generator_pipeline(
+             prompt,
+             max_new_tokens=int(max_new_tokens),
              do_sample=True,
+             temperature=float(temperature) if float(temperature) > 0 else 0.7,  # temp 0 means greedy
+             top_p=float(top_p),
+             num_return_sequences=1
          )
+         response = generated_outputs[0]['generated_text']
+
+         # Often, the response includes the prompt. Let's try to return only the new part.
+         # The model should generate text after "### Risposta:"
+         answer_marker = "### Risposta:"
+         if answer_marker in response:
+             return response.split(answer_marker, 1)[1].strip()
+         else:
+             # Fallback if the marker isn't found (shouldn't happen with good prompting)
+             return response  # Or you could try to remove the original prompt string
+
+     except Exception as e:
+         return f"Error during text generation: {str(e)}"
+
+ # --- Gradio Interface ---
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown(f"""
+     # 📝 Text Analysis with {MODEL_ID}
+     Test the capabilities of the `{MODEL_ID}` model for text analysis tasks on Italian or English texts.
+     Provide an instruction and your text (directly or via upload).
+     **Important:** Model loading can take a few minutes, especially on the first run or on CPU.
+     This app is best run on a Hugging Face Space with GPU resources for this model size.
+     """)
+
+     with gr.Row():
+         status_textbox = gr.Textbox(label="Model Status", value="Attempting to load model...", interactive=False)
+
+     with gr.Tab("Text Input & Analysis"):
+         with gr.Row():
+             with gr.Column(scale=2):
+                 instruction_prompt = gr.Textbox(
+                     label="Instruction for the Model (e.g., 'Riassumi questo testo', 'Identify main topics', 'Translate to English')",
+                     value="Riassumi questo testo in 3 frasi concise.",
+                     lines=3
+                 )
+                 text_area_input = gr.Textbox(label="Enter Text Directly", lines=10, placeholder="Paste your text here...")
+                 file_input = gr.File(label="Or Upload a Document (.txt)", file_types=['.txt'])
+             with gr.Column(scale=3):
+                 output_text = gr.Textbox(label="Model Output", lines=20, interactive=False)
+
+         with gr.Accordion("Advanced Generation Parameters", open=False):
+             max_new_tokens_slider = gr.Slider(minimum=50, maximum=1024, value=256, step=10, label="Max New Tokens")
+             temperature_slider = gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature (higher is more creative)")
+             top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top P (nucleus sampling)")
+
+         analyze_button = gr.Button("🧠 Analyze Text", variant="primary")
+
+         analyze_button.click(
+             fn=analyze_text,
+             inputs=[text_area_input, file_input, instruction_prompt, max_new_tokens_slider, temperature_slider, top_p_slider],
+             outputs=output_text
+         )
+
+     # Load the model when the app starts.
+     # This will update the status_textbox after attempting to load.
+     def startup_load_model():
+         if load_model_and_pipeline():
+             return "Model loaded successfully and ready."
+         else:
+             return f"Failed to load model. Error: {model_load_error or 'Unknown error during startup.'}"
+
+     demo.load(startup_load_model, outputs=status_textbox)

  if __name__ == "__main__":
+     # For local testing, set HF_TOKEN as an environment variable or log in via the CLI first.
+     # You would run: HF_TOKEN="your_hf_token_here" python app.py
+     # If it is not set, loading will fail unless you have already run `huggingface-cli login`.
+     if not HF_TOKEN:
+         print("WARNING: HF_TOKEN environment variable not set.")
+         print("For local execution, either set HF_TOKEN or ensure you are logged in via 'huggingface-cli login'.")
+         # Attempt to reuse the token stored by `huggingface-cli login`, if any
+         try:
+             HF_TOKEN = huggingface_hub.get_token()  # reads the token saved by the CLI login
+             if HF_TOKEN:
+                 print("Using token from huggingface-cli login.")
+             else:
+                 print("Could not retrieve token from CLI login. Model access might fail.")
+         except Exception as e:
+             print(f"Could not check CLI login status: {e}. Model access might fail.")

+     demo.queue().launch(debug=True, share=False)  # share=True for a public link when running locally
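
The comments in load_model_and_pipeline() mention quantization (e.g., bitsandbytes) as a way to fit the 7B model into a smaller Space but leave it unimplemented. A minimal sketch of what a 4-bit variant of the loading step could look like, assuming a CUDA GPU Space with the `bitsandbytes` and `accelerate` packages installed; illustrative only, not part of this commit:

# Sketch only: 4-bit NF4 loading with bitsandbytes, as hinted at in the loading comments.
# Assumes a CUDA GPU plus `bitsandbytes` and `accelerate`; not part of the commit.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

MODEL_ID = "Fastweb/FastwebMIIA-7B"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store weights in 4-bit NF4
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,  # compute in bf16 for speed/stability
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    quantization_config=bnb_config,         # replaces the plain torch_dtype="auto" load
    device_map="auto",
)
text_generator_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)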
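
analyze_text() reads the upload via file_upload.name, which matches older Gradio versions where gr.File passes a tempfile-like object; depending on the Gradio version installed in the Space, the component may instead pass a plain path string, and .name would then fail. A small guard that handles both cases, shown only as a sketch (the version behaviour is an assumption, not confirmed by the commit):

# Sketch only: accept either a tempfile-like object (older Gradio) or a plain path string.
file_path = file_upload if isinstance(file_upload, str) else file_upload.name
with open(file_path, 'r', encoding='utf-8') as f:
    content_to_analyze = f.read()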
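
analyze_text() recovers the answer by splitting on the "### Risposta:" marker because the text-generation pipeline echoes the prompt by default. The pipeline also accepts return_full_text=False, which returns only the newly generated text. A small sketch of that alternative call, reusing the names from analyze_text(); illustrative only, not part of this commit:

# Sketch only: let the pipeline drop the echoed prompt instead of splitting on the marker.
# Reuses text_generator_pipeline, prompt, max_new_tokens, temperature and top_p from analyze_text().
generated_outputs = text_generator_pipeline(
    prompt,
    max_new_tokens=int(max_new_tokens),
    do_sample=True,
    temperature=float(temperature),
    top_p=float(top_p),
    num_return_sequences=1,
    return_full_text=False,  # only the continuation appears in 'generated_text'
)
response = generated_outputs[0]["generated_text"].strip()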