import os
import google.generativeai as ai
import pandas as pd
import numpy as np
import gradio as gr
import markdown
import time
import boto3
import json
import math
import string
import re
import spaces
from rapidfuzz import process, fuzz
from tqdm import tqdm
from gradio import Progress
from typing import List, Tuple
from io import StringIO

from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, create_general_topics_system_prompt, create_general_topics_prompt, force_existing_topics_prompt, allow_new_topics_prompt, verify_titles_prompt, verify_titles_system_prompt
from tools.helper_functions import output_folder, detect_file_type, get_file_name_no_ext, read_file, get_or_create_env_var, model_name_map, put_columns_in_df, wrap_text
from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL
from tools.llm_api_call import normalise_string, load_in_file, load_in_data_file, load_in_previous_data_files, get_basic_response_data, data_file_to_markdown_table, replace_punctuation_with_underscore, construct_gemini_generative_model, call_aws_claude, send_request, process_requests, clean_markdown_table, clean_column_name, create_unique_table_df_from_reference_table, remove_before_last_term, convert_response_text_to_markdown_table, call_llm_with_markdown_table_checks, write_llm_output_and_logs, convert_reference_table_to_pivot_table, ResponseObject, max_tokens, timeout_wait, number_of_api_retry_attempts, MAX_OUTPUT_VALIDATION_ATTEMPTS, max_time_for_loop, batch_size_default, MAX_COMMENT_CHARS, max_comment_character_length, AWS_DEFAULT_REGION, bedrock_runtime, GradioFileData

def write_llm_output_and_logs_verify(responses: List[ResponseObject],
                                     whole_conversation: List[str],
                                     whole_conversation_metadata: List[str],
                                     file_name: str,
                                     latest_batch_completed: int,
                                     start_row: int,
                                     end_row: int,
                                     model_choice_clean: str,
                                     temperature: float,
                                     log_files_output_paths: List[str],
                                     existing_reference_df: pd.DataFrame,
                                     existing_topics_df: pd.DataFrame,
                                     batch_size_number: int,
                                     in_column: str,
                                     first_run: bool = False) -> Tuple:
    """
    Writes the output of the large language model requests and logs to files.

    Parameters:
    - responses (List[ResponseObject]): A list of ResponseObject instances containing the text and usage metadata of the responses.
    - whole_conversation (List[str]): A list of strings representing the complete conversation, including prompts and responses.
    - whole_conversation_metadata (List[str]): A list of strings representing metadata about the whole conversation.
    - file_name (str): The base part of the output file name.
    - latest_batch_completed (int): The index of the current batch.
    - start_row (int): Start row of the current batch.
    - end_row (int): End row of the current batch.
    - model_choice_clean (str): The cleaned model choice string.
    - temperature (float): The temperature parameter used in the model.
    - log_files_output_paths (List[str]): A list of paths to the log files.
    - existing_reference_df (pd.DataFrame): The existing reference dataframe mapping response numbers to topics.
    - existing_topics_df (pd.DataFrame): The existing unique topics dataframe.
    - batch_size_number (int): The number of response rows in the current batch.
    - in_column (str): The name of the open-text column being processed.
    - first_run (bool): A boolean indicating if this is the first run through this function in this process. Defaults to False.

    Returns a tuple of (topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error).
    """
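    # Default "error" outputs; these are returned unchanged if the model response cannot be parsed into a table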
    topic_table_out_path = "topic_table_error.csv"
    reference_table_out_path = "reference_table_error.csv"
    unique_topics_df_out_path = "unique_topic_table_error.csv"
    topic_with_response_df = pd.DataFrame()
    markdown_table = ""
    out_reference_df = pd.DataFrame()
    out_unique_topics_df = pd.DataFrame()
    batch_file_path_details = "error"

    is_error = False

    whole_conversation_str = '\n'.join(whole_conversation)
    whole_conversation_metadata_str = '\n'.join(whole_conversation_metadata)

    start_row_reported = start_row + 1

    in_column_cleaned = clean_column_name(in_column, max_length=20)
    file_name = clean_column_name(file_name, max_length=30)

    batch_file_path_details = f"{file_name}_batch_{latest_batch_completed + 1}_size_{batch_size_number}_col_{in_column_cleaned}"
    row_number_string_start = f"Rows {start_row_reported} to {end_row}: "

    whole_conversation_path = output_folder + batch_file_path_details + "_full_conversation_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
    whole_conversation_path_meta = output_folder + batch_file_path_details + "_metadata_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"

    with open(whole_conversation_path, "w", encoding='utf-8', errors='replace') as f:
        f.write(whole_conversation_str)

    with open(whole_conversation_path_meta, "w", encoding='utf-8', errors='replace') as f:
        f.write(whole_conversation_metadata_str)

    log_files_output_paths.append(whole_conversation_path_meta)

    if isinstance(responses[-1], ResponseObject): response_text = responses[-1].text
    elif "choices" in responses[-1]: response_text = responses[-1]["choices"][0]['text']
    else: response_text = responses[-1].text

    try:
        topic_with_response_df, is_error = convert_response_text_to_markdown_table(response_text, table_type="Verify titles table")
    except Exception as e:
        print("Error in parsing markdown table from response text:", e)
        return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error

    topic_with_response_df.columns = ["Response References", "Is this a suitable title", "Explanation", "Alternative title"]

    reference_data = []

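    # Each parsed row may reference several responses (e.g. "1, 3, 5"); expand it into one
    # reference row per response number, offset by the batch's starting row in the file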
    for index, row in topic_with_response_df.iterrows():

        references = re.findall(r'\d+', str(row.iloc[0])) if pd.notna(row.iloc[0]) else []
        topic = row.iloc[1] if pd.notna(row.iloc[1]) else ""
        summary = row.iloc[2] if pd.notna(row.iloc[2]) else ""
        suggested_title = row.iloc[3] if pd.notna(row.iloc[3]) else ""

        for ref in references:
            try:
                response_ref_no = str(int(ref) + int(start_row))
            except ValueError:
                print("Reference is not a number")
                continue

            row_data = {
                'Response References': response_ref_no,
                'Is this a suitable title': topic,
                'Explanation': summary,
                "Start row of group": start_row_reported,
                "Suggested title": suggested_title
            }

            reference_data.append(row_data)

    new_reference_df = pd.DataFrame(reference_data)

    print("new_reference_df:", new_reference_df)

    out_reference_df = pd.concat([new_reference_df, existing_reference_df]).dropna(how='all')

    try:
        out_reference_df["Response References"] = out_reference_df["Response References"].astype(int)
    except Exception as e:
        print("Could not convert Response References column to integer due to", e)
        print("out_reference_df['Response References']:", out_reference_df["Response References"].head())

    out_reference_df.sort_values(["Start row of group", "Response References"], inplace=True)

    out_unique_topics_df = topic_with_response_df

    topic_table_out_path = output_folder + batch_file_path_details + "_topic_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
    unique_topics_df_out_path = output_folder + batch_file_path_details + "_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
    reference_table_out_path = output_folder + batch_file_path_details + "_reference_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"

    return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error


@spaces.GPU
def verify_titles(in_data_file,
                  file_data: pd.DataFrame,
                  existing_topics_table: pd.DataFrame,
                  existing_reference_df: pd.DataFrame,
                  existing_unique_topics_df: pd.DataFrame,
                  unique_table_df_display_table_markdown: str,
                  file_name: str,
                  num_batches: int,
                  in_api_key: str,
                  temperature: float,
                  chosen_cols: List[str],
                  model_choice: str,
                  candidate_topics: GradioFileData = None,
                  latest_batch_completed: int = 0,
                  out_message: List = [],
                  out_file_paths: List = [],
                  log_files_output_paths: List = [],
                  first_loop_state: bool = False,
                  whole_conversation_metadata_str: str = "",
                  initial_table_prompt: str = initial_table_prompt,
                  prompt2: str = prompt2,
                  prompt3: str = prompt3,
                  system_prompt: str = system_prompt,
                  add_existing_topics_system_prompt: str = add_existing_topics_system_prompt,
                  add_existing_topics_prompt: str = add_existing_topics_prompt,
                  number_of_prompts_used: int = 1,
                  batch_size: int = 50,
                  context_textbox: str = "",
                  time_taken: float = 0,
                  sentiment_checkbox: str = "Negative, Neutral, or Positive",
                  force_zero_shot_radio: str = "No",
                  in_excel_sheets: List[str] = [],
                  max_tokens: int = max_tokens,
                  model_name_map: dict = model_name_map,
                  max_time_for_loop: int = max_time_for_loop,
                  progress=Progress(track_tqdm=True)):
    '''
    Query an LLM (local Gemma 2B Instruct, Gemini, or Anthropic-based models on AWS Bedrock) with up to three prompts about a table of open text data. Up to 'batch_size' rows are queried at a time.

    Parameters:
    - in_data_file (gr.File): Gradio file object containing input data
    - file_data (pd.DataFrame): Pandas dataframe containing the consultation response data.
    - existing_topics_table (pd.DataFrame): Pandas dataframe containing the latest master topic table that has been iterated through batches.
    - existing_reference_df (pd.DataFrame): Pandas dataframe containing the list of Response reference numbers alongside the derived topics and subtopics.
    - existing_unique_topics_df (pd.DataFrame): Pandas dataframe containing the unique list of topics, subtopics, sentiment and summaries until this point.
    - unique_table_df_display_table_markdown (str): Table for display in markdown format.
    - file_name (str): File name of the data file.
    - num_batches (int): Number of batches required to go through all the response rows.
    - in_api_key (str): The API key for authentication.
    - temperature (float): The temperature parameter for the model.
    - chosen_cols (List[str]): A list of chosen columns to process.
    - model_choice (str): The choice of model to use.
    - candidate_topics (gr.FileData): A Gradio FileData object of existing candidate topics submitted by the user.
    - latest_batch_completed (int): The index of the latest batch completed.
    - out_message (list): A list to store output messages.
    - out_file_paths (list): A list to store output file paths.
    - log_files_output_paths (list): A list to store log file output paths.
    - first_loop_state (bool): A flag indicating the first loop state.
    - whole_conversation_metadata_str (str): A string to store whole conversation metadata.
    - initial_table_prompt (str): The first prompt for the model.
    - prompt2 (str): The second prompt for the model.
    - prompt3 (str): The third prompt for the model.
    - system_prompt (str): The system prompt for the model.
    - add_existing_topics_system_prompt (str): The system prompt for the summary part of the model.
    - add_existing_topics_prompt (str): The prompt for the model summary.
    - number_of_prompts_used (int): The number of prompts to send to the model.
    - batch_size (int): The number of data rows to consider in each request.
    - context_textbox (str, optional): A string giving some context to the consultation/task.
    - time_taken (float, optional): The amount of time taken to process the responses up until this point.
    - sentiment_checkbox (str, optional): What type of sentiment analysis should the topic modeller do?
    - force_zero_shot_radio (str, optional): Should responses be forced into a zero-shot topic or not.
    - in_excel_sheets (List[str], optional): List of Excel sheets to load from the input file.
    - max_tokens (int): The maximum number of tokens for the model.
    - model_name_map (dict, optional): A dictionary mapping the full model name to a shortened version.
    - max_time_for_loop (int, optional): The maximum number of seconds the function should run before breaking (so it can be run again; this avoids timeouts with some AWS services if deployed there).
    - progress (Progress): A progress tracker.
    '''

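    # Overall flow: load the data file if needed, then loop over response batches; for each batch,
    # build a markdown table of responses, query the chosen model, parse and log its output, and
    # accumulate the reference and unique-topic tables before writing the final outputs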
    tic = time.perf_counter()
    model = ""
    config = ""
    final_time = 0.0
    whole_conversation_metadata = []
    is_error = False
    create_revised_general_topics = False
    local_model = []
    tokenizer = []
    zero_shot_topics_df = pd.DataFrame()

    llama_cpp_prefix = "<start_of_turn>user\n"
    llama_cpp_suffix = "<end_of_turn>\n<start_of_turn>model\n"

    if file_data.empty:
        print("No data table found, loading from file")
        try:
            in_colnames_drop, in_excel_sheets, file_name = put_columns_in_df(in_data_file)
            file_data, file_name, num_batches = load_in_data_file(in_data_file, chosen_cols, batch_size_default, in_excel_sheets)
        except Exception:
            out_message = "Please enter a data file to summarise."
            print(out_message)
            raise Exception(out_message)

    model_choice_clean = model_name_map[model_choice]

    if first_loop_state == True:
        print("This is the first time through the loop, resetting latest_batch_completed to 0")
        if (latest_batch_completed == 999) or (latest_batch_completed == 0):
            latest_batch_completed = 0
            out_message = []
            out_file_paths = []

    if (model_choice == "gemma_2b_it_local") and (RUN_LOCAL_MODEL == "1"):
        progress(0.1, "Loading in Gemma 2b model")
        local_model, tokenizer = load_model()
        print("Local model loaded:", local_model)

    if num_batches > 0:
        progress_measure = round(latest_batch_completed / num_batches, 1)
        progress(progress_measure, desc="Querying large language model")
    else:
        progress(0.1, desc="Querying large language model")

    if latest_batch_completed < num_batches:

        if isinstance(out_message, str):
            out_message = [out_message]

        if not out_file_paths:
            out_file_paths = []

        if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[0] > 300:
            out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
            print(out_message)
            raise Exception(out_message)

        if sentiment_checkbox == "Negative, Neutral, or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
        elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative or Positive"
        elif sentiment_checkbox == "Do not assess sentiment": sentiment_prompt = "Create a third column containing only the text 'Not assessed'"
        else: sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"

        topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
        topics_loop = tqdm(range(latest_batch_completed, num_batches), desc=topics_loop_description, unit="batches remaining")

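        # Process one batch of responses per loop iteration, stopping early if max_time_for_loop is exceeded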
        for i in topics_loop:

            reported_batch_no = latest_batch_completed + 1
            print("Running query batch", str(reported_batch_no))
            print("batch_size:", batch_size)

            simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_df = data_file_to_markdown_table(file_data, file_name, chosen_cols, output_folder, latest_batch_completed, batch_size, verify_titles=True)

            conversation_history = []

            print("normalised_simple_markdown_table:", normalised_simple_markdown_table)

            if not batch_basic_response_df.empty:

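                # Subsequent batches, or runs where candidate topics were supplied, reuse the 'add existing topics' prompts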
                if latest_batch_completed >= 1 or candidate_topics is not None:

                    if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
                        print("Using Gemini model:", model_choice)
                        model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=add_existing_topics_system_prompt, max_tokens=max_tokens)
                    elif model_choice in ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"]:
                        print("Using AWS Bedrock model:", model_choice)
                    else:
                        print("Using local model:", model_choice)

                    formatted_system_prompt = add_existing_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols[0])
                    formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table)

                    print("formatted_summary_prompt:", formatted_summary_prompt)

                    if model_choice == "gemma_2b_it_local":
                        formatted_summary_prompt = llama_cpp_prefix + formatted_system_prompt + "\n" + formatted_summary_prompt + llama_cpp_suffix
                        full_prompt = formatted_summary_prompt
                    else:
                        full_prompt = formatted_system_prompt + formatted_summary_prompt

                    formatted_prompt_output_path = output_folder + file_name + "_" + str(reported_batch_no) + "_full_prompt_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"

                    try:
                        with open(formatted_prompt_output_path, "w", encoding='utf-8', errors='replace') as f:
                            f.write(full_prompt)
                    except Exception as e:
                        print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")

                    if model_choice == "gemma_2b_it_local":
                        summary_prompt_list = [full_prompt]
                    else:
                        summary_prompt_list = [formatted_summary_prompt]

                    summary_conversation_history = []
                    summary_whole_conversation = []

                    # Seed the conversation for this batch with the formatted system prompt
                    whole_conversation = [formatted_system_prompt]

                    responses, summary_conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master=True)

                    topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, new_topic_df, new_markdown_table, new_reference_df, new_unique_topics_df, master_batch_out_file_part, is_error = write_llm_output_and_logs_verify(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, batch_size, chosen_cols, first_run=False)

                    try:
                        final_table_output_path = output_folder + master_batch_out_file_part + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"

                        if isinstance(responses[-1], ResponseObject):
                            with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
                                f.write(responses[-1].text)
                        elif "choices" in responses[-1]:
                            with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
                                f.write(responses[-1]["choices"][0]['text'])
                        else:
                            with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
                                f.write(responses[-1].text)

                    except Exception as e:
                        print("Error in returning model response:", e)

                    if is_error == True:
                        final_message_out = "Could not complete summary, error in LLM output."
                        raise Exception(final_message_out)

                    new_topic_df.to_csv(topic_table_out_path, index=None)
                    log_files_output_paths.append(topic_table_out_path)

                    new_reference_df.to_csv(reference_table_out_path, index=None)
                    out_file_paths.append(reference_table_out_path)

                    new_unique_topics_df = pd.concat([new_unique_topics_df, existing_unique_topics_df])
                    new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
                    out_file_paths.append(unique_topics_df_out_path)

                    unique_table_df_display_table = new_unique_topics_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
                    unique_table_df_display_table_markdown = unique_table_df_display_table.to_markdown(index=False)

                    whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)

                    # Keep only the output and log file paths that belong to the current batch
                    out_file_paths = [path for path in out_file_paths if str(reported_batch_no) in path]
                    log_files_output_paths = [path for path in log_files_output_paths if str(reported_batch_no) in path]

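                # First batch with no candidate topics supplied: build the initial table prompts and query from scratch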
                else:

                    if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
                        print("Using Gemini model:", model_choice)
                        model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
                    elif model_choice in ["gemma_2b_it_local"]:
                        print("Using local Gemma 2b model")
                    else:
                        print("Using AWS Bedrock model:", model_choice)

                    formatted_initial_table_system_prompt = system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)

                    formatted_initial_table_prompt = initial_table_prompt.format(response_table=normalised_simple_markdown_table)

                    if prompt2: formatted_prompt2 = prompt2.format(response_table=normalised_simple_markdown_table)
                    else: formatted_prompt2 = prompt2

                    if prompt3: formatted_prompt3 = prompt3.format(response_table=normalised_simple_markdown_table)
                    else: formatted_prompt3 = prompt3

                    if model_choice == "gemma_2b_it_local":
                        formatted_initial_table_prompt = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_initial_table_prompt + llama_cpp_suffix
                        formatted_prompt2 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt2 + llama_cpp_suffix
                        formatted_prompt3 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt3 + llama_cpp_suffix

                    batch_prompts = [formatted_initial_table_prompt, formatted_prompt2, formatted_prompt3][:number_of_prompts_used]

                    whole_conversation = [formatted_initial_table_system_prompt]

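                    # Query the model with the batch prompts, re-prompting up to MAX_OUTPUT_VALIDATION_ATTEMPTS times if the output is not a valid markdown table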
                    responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS)

                    topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_table_df, markdown_table, reference_df, new_unique_topics_df, batch_file_path_details, is_error = write_llm_output_and_logs_verify(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, batch_size, chosen_cols, first_run=True)

                    if is_error == True:
                        raise Exception("Error in output table parsing")

                    topic_table_df.to_csv(topic_table_out_path, index=None)
                    out_file_paths.append(topic_table_out_path)

                    reference_df.to_csv(reference_table_out_path, index=None)
                    out_file_paths.append(reference_table_out_path)

                    new_unique_topics_df = pd.concat([new_unique_topics_df, existing_unique_topics_df])
                    new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
                    out_file_paths.append(unique_topics_df_out_path)

                    whole_conversation_metadata.append(whole_conversation_metadata_str)
                    whole_conversation_metadata_str = '. '.join(whole_conversation_metadata)

                    try:
                        final_table_output_path = output_folder + batch_file_path_details + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"

                        if isinstance(responses[-1], ResponseObject):
                            with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
                                f.write(responses[-1].text)
                            unique_table_df_display_table_markdown = responses[-1].text
                        elif "choices" in responses[-1]:
                            with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
                                f.write(responses[-1]["choices"][0]['text'])
                            unique_table_df_display_table_markdown = responses[-1]["choices"][0]['text']
                        else:
                            with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
                                f.write(responses[-1].text)
                            unique_table_df_display_table_markdown = responses[-1].text

                        log_files_output_paths.append(final_table_output_path)

                    except Exception as e:
                        print("Error in returning model response:", e)

                    new_topic_df = topic_table_df
                    new_reference_df = reference_df

            else:
                print("Current batch of responses contains no text, moving onto next. Batch number:", str(latest_batch_completed + 1), ". Start row:", start_row, ". End row:", end_row)

            if latest_batch_completed <= num_batches:
                print("Completed batch number:", str(reported_batch_no))
                latest_batch_completed += 1

            toc = time.perf_counter()
            final_time = toc - tic

            if final_time > max_time_for_loop:
                print("Max time reached, breaking loop.")
                topics_loop.close()
                tqdm._instances.clear()
                break

        existing_reference_df = new_reference_df.dropna(how='all')
        existing_unique_topics_df = new_unique_topics_df.dropna(how='all')
        existing_topics_table = new_topic_df.dropna(how='all')

        modifiable_unique_topics_df = existing_unique_topics_df

        out_time = f"{final_time:0.1f} seconds."

        out_message.append('All queries successfully completed in')
        final_message_out = '\n'.join(out_message)
        final_message_out = final_message_out + " " + out_time
        print(final_message_out)

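    # All batches processed: write the final combined reference and unique-topic tables and return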
    if latest_batch_completed >= num_batches:
        print("Last batch reached, returning batch:", str(latest_batch_completed))

        toc = time.perf_counter()
        final_time = (toc - tic) + time_taken
        out_time = f"Everything finished in {round(final_time,1)} seconds."
        print(out_time)

        print("All summaries completed. Creating outputs.")

        model_choice_clean = model_name_map[model_choice]

        in_column_cleaned = clean_column_name(chosen_cols, max_length=20)
        file_name = clean_column_name(file_name, max_length=30)

        file_path_details = f"{file_name}_col_{in_column_cleaned}"

        reference_table_out_path = output_folder + file_path_details + "_final_reference_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
        unique_topics_df_out_path = output_folder + file_path_details + "_final_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
        basic_response_data_out_path = output_folder + file_path_details + "_simplified_data_file_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"

        existing_reference_df.to_csv(reference_table_out_path, index=None)
        out_file_paths.append(reference_table_out_path)

        final_out_unique_topics_df = existing_unique_topics_df
        final_out_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
        out_file_paths.append(unique_topics_df_out_path)

        out_file_paths = [x for x in out_file_paths if '_final_' in x]

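        # Save a simplified copy of the input responses and flag any response references missing from the model output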
        basic_response_data = get_basic_response_data(file_data, chosen_cols, verify_titles=True)

        pd.DataFrame(basic_response_data).to_csv(basic_response_data_out_path, index=None)
        log_files_output_paths.append(basic_response_data_out_path)

        missing_references = basic_response_data[~basic_response_data['Reference'].astype(str).isin(existing_reference_df['Response References'].astype(str).unique())]

        missing_df = pd.DataFrame(columns=existing_reference_df.columns)
        missing_df['Response References'] = missing_references['Reference']
        missing_df = missing_df.fillna(np.nan)

        missing_df_out_path = output_folder + file_path_details + "_missing_references_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
        missing_df.to_csv(missing_df_out_path, index=None)
        log_files_output_paths.append(missing_df_out_path)

        out_file_paths = list(set(out_file_paths))
        log_files_output_paths = list(set(log_files_output_paths))

        final_out_file_paths = [file_path for file_path in out_file_paths if "final_" in file_path]

        modifiable_unique_topics_df = final_out_unique_topics_df

        print("latest_batch_completed at end of batch iterations to return is", latest_batch_completed)

        return unique_table_df_display_table_markdown, existing_topics_table, final_out_unique_topics_df, existing_reference_df, final_out_file_paths, final_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, final_out_file_paths, final_out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count=(modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), final_out_file_paths

    return unique_table_df_display_table_markdown, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count=(modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), out_file_paths