# tools/verify_titles.py
import os
import google.generativeai as ai
import pandas as pd
import numpy as np
import gradio as gr
import markdown
import time
import boto3
import json
import math
import string
import re
import spaces
from rapidfuzz import process, fuzz
from tqdm import tqdm
from gradio import Progress
from typing import List, Tuple
from io import StringIO
from tools.prompts import (initial_table_prompt, prompt2, prompt3, system_prompt,
summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt,
add_existing_topics_system_prompt, add_existing_topics_prompt,
create_general_topics_system_prompt, create_general_topics_prompt,
force_existing_topics_prompt, allow_new_topics_prompt,
verify_titles_prompt, verify_titles_system_prompt)
from tools.helper_functions import (output_folder, detect_file_type, get_file_name_no_ext,
read_file, get_or_create_env_var, model_name_map, put_columns_in_df, wrap_text)
from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL
from tools.llm_api_call import (normalise_string, load_in_file, load_in_data_file,
load_in_previous_data_files, get_basic_response_data, data_file_to_markdown_table,
replace_punctuation_with_underscore, construct_gemini_generative_model, call_aws_claude,
send_request, process_requests, clean_markdown_table, clean_column_name,
create_unique_table_df_from_reference_table, remove_before_last_term,
convert_response_text_to_markdown_table, call_llm_with_markdown_table_checks,
write_llm_output_and_logs, convert_reference_table_to_pivot_table, ResponseObject,
max_tokens, timeout_wait, number_of_api_retry_attempts, MAX_OUTPUT_VALIDATION_ATTEMPTS,
max_time_for_loop, batch_size_default, MAX_COMMENT_CHARS, max_comment_character_length,
AWS_DEFAULT_REGION, bedrock_runtime, GradioFileData)
#
def write_llm_output_and_logs_verify(responses: List[ResponseObject],
whole_conversation: List[str],
whole_conversation_metadata: List[str],
file_name: str,
latest_batch_completed: int,
start_row:int,
end_row:int,
model_choice_clean: str,
temperature: float,
log_files_output_paths: List[str],
existing_reference_df:pd.DataFrame,
existing_topics_df:pd.DataFrame,
batch_size_number:int,
in_column:str,
first_run: bool = False) -> Tuple[str, str, str, pd.DataFrame, str, pd.DataFrame, pd.DataFrame, str, bool]:
"""
Writes the output of the large language model requests and logs to files.
Parameters:
- responses (List[ResponseObject]): A list of ResponseObject instances containing the text and usage metadata of the responses.
- whole_conversation (List[str]): A list of strings representing the complete conversation including prompts and responses.
- whole_conversation_metadata (List[str]): A list of strings representing metadata about the whole conversation.
- file_name (str): The base part of the output file name.
- latest_batch_completed (int): The index of the current batch.
- start_row (int): Start row of the current batch.
- end_row (int): End row of the current batch.
- model_choice_clean (str): The cleaned model choice string.
- temperature (float): The temperature parameter used in the model.
- log_files_output_paths (List[str]): A list of paths to the log files.
- existing_reference_df (pd.DataFrame): The existing reference dataframe mapping response numbers to topics.
- existing_topics_df (pd.DataFrame): The existing unique topics dataframe.
- batch_size_number (int): The number of data rows processed in this batch.
- in_column (str): The name of the input column being processed, used in output file names.
- first_run (bool): A boolean indicating if this is the first run through this function in this process. Defaults to False.
Returns a tuple of (topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error).
"""
topic_table_out_path = "topic_table_error.csv"
reference_table_out_path = "reference_table_error.csv"
unique_topics_df_out_path = "unique_topic_table_error.csv"
topic_with_response_df = pd.DataFrame()
markdown_table = ""
out_reference_df = pd.DataFrame()
out_unique_topics_df = pd.DataFrame()
batch_file_path_details = "error"
# If there was an error in parsing, return boolean saying error
is_error = False
# Convert conversation to string and add to log outputs
whole_conversation_str = '\n'.join(whole_conversation)
whole_conversation_metadata_str = '\n'.join(whole_conversation_metadata)
start_row_reported = start_row + 1
# Clean the input column name so it can be used safely in output file names
in_column_cleaned = clean_column_name(in_column, max_length=20)
# Need to reduce output file names as full length files may be too long
file_name = clean_column_name(file_name, max_length=30)
# Save outputs for each batch. If master file created, label file as master
batch_file_path_details = f"{file_name}_batch_{latest_batch_completed + 1}_size_{batch_size_number}_col_{in_column_cleaned}"
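# e.g. "consult_responses_batch_2_size_50_col_title" (illustrative value only; the actual string depends on the input file name, batch number, batch size and column)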
row_number_string_start = f"Rows {start_row_reported} to {end_row}: "
whole_conversation_path = output_folder + batch_file_path_details + "_full_conversation_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
whole_conversation_path_meta = output_folder + batch_file_path_details + "_metadata_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
with open(whole_conversation_path, "w", encoding='utf-8', errors='replace') as f:
f.write(whole_conversation_str)
with open(whole_conversation_path_meta, "w", encoding='utf-8', errors='replace') as f:
f.write(whole_conversation_metadata_str)
#log_files_output_paths.append(whole_conversation_path)
log_files_output_paths.append(whole_conversation_path_meta)
if isinstance(responses[-1], ResponseObject): response_text = responses[-1].text
elif "choices" in responses[-1]: response_text = responses[-1]["choices"][0]['text']
else: response_text = responses[-1].text
# Convert response text to a markdown table
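# The model is expected to return a markdown table with four columns, e.g.:
# | Response References | Is this a suitable title | Explanation | Alternative title |
# Header wording may vary slightly; the columns are renamed to these names below.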
try:
topic_with_response_df, is_error = convert_response_text_to_markdown_table(response_text, table_type="Verify titles table")
except Exception as e:
print("Error in parsing markdown table from response text:", e)
return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
# Rename columns to ensure consistent use of data frames later in code
topic_with_response_df.columns = ["Response References", "Is this a suitable title", "Explanation", "Alternative title"]
# # Table to map references to topics
reference_data = []
# Iterate through each row in the original DataFrame
for index, row in topic_with_response_df.iterrows():
#references = re.split(r',\s*|\s+', str(row.iloc[4])) if pd.notna(row.iloc[4]) else ""
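# Pull out every integer in the 'Response References' cell, e.g. "1, 3, 5" -> ["1", "3", "5"]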
references = re.findall(r'\d+', str(row.iloc[0])) if pd.notna(row.iloc[0]) else []
topic = row.iloc[1] if pd.notna(row.iloc[1]) else ""
summary = row.iloc[2] if pd.notna(row.iloc[2]) else ""
suggested_title = row.iloc[3] if pd.notna(row.iloc[3]) else ""
#summary = row_number_string_start + summary
# Create a new entry for each reference number
for ref in references:
# Add start_row back onto reference_number
try:
response_ref_no = str(int(ref) + int(start_row))
except ValueError:
print("Reference is not a number")
continue
row_data = {
'Response References': response_ref_no,
'Is this a suitable title': topic,
'Explanation': summary,
"Start row of group": start_row_reported,
"Suggested title": suggested_title
}
reference_data.append(row_data)
# Create a new DataFrame from the reference data
new_reference_df = pd.DataFrame(reference_data)
print("new_reference_df:", new_reference_df)
# Append on old reference data
out_reference_df = pd.concat([new_reference_df, existing_reference_df]).dropna(how='all')
# # Remove duplicate Response References for the same topic
# out_reference_df.drop_duplicates(["Response References", "General Topic", "Subtopic", "Sentiment"], inplace=True)
# Try converting response references column to int, keep as string if fails
try:
out_reference_df["Response References"] = out_reference_df["Response References"].astype(int)
except Exception as e:
print("Could not convert Response References column to integer due to", e)
print("out_reference_df['Response References']:", out_reference_df["Response References"].head())
out_reference_df.sort_values(["Start row of group", "Response References"], inplace=True)
# # Each topic should only be associated with each individual response once
# out_reference_df.drop_duplicates(["Response References", "General Topic", "Subtopic", "Sentiment"], inplace=True)
# # Save the new DataFrame to CSV
# reference_table_out_path = output_folder + batch_file_path_details + "_reference_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
# # Table of all unique topics with descriptions
# #print("topic_with_response_df:", topic_with_response_df)
# new_unique_topics_df = topic_with_response_df[["General Topic", "Subtopic", "Sentiment"]]
# new_unique_topics_df = new_unique_topics_df.rename(columns={new_unique_topics_df.columns[0]: "General Topic", new_unique_topics_df.columns[1]: "Subtopic", new_unique_topics_df.columns[2]: "Sentiment"})
# # Join existing and new unique topics
# out_unique_topics_df = pd.concat([new_unique_topics_df, existing_topics_df]).dropna(how='all')
# out_unique_topics_df = out_unique_topics_df.rename(columns={out_unique_topics_df.columns[0]: "General Topic", out_unique_topics_df.columns[1]: "Subtopic", out_unique_topics_df.columns[2]: "Sentiment"})
# out_unique_topics_df = out_unique_topics_df.drop_duplicates(["General Topic", "Subtopic", "Sentiment"]).\
# drop(["Response References", "Summary"], axis = 1, errors="ignore")
# # Get count of rows that refer to particular topics
# reference_counts = out_reference_df.groupby(["General Topic", "Subtopic", "Sentiment"]).agg({
# 'Response References': 'size', # Count the number of references
# 'Summary': ' <br> '.join
# }).reset_index()
# # Join the counts to existing_unique_topics_df
# out_unique_topics_df = out_unique_topics_df.merge(reference_counts, how='left', on=["General Topic", "Subtopic", "Sentiment"]).sort_values("Response References", ascending=False)
#out_reference_df = topic_with_response_df
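# For title verification the parsed response table is used directly as the 'unique topics' output;
# the per-topic aggregation used in topic extraction (commented out above) is not needed here.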
out_unique_topics_df = topic_with_response_df
topic_table_out_path = output_folder + batch_file_path_details + "_topic_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
unique_topics_df_out_path = output_folder + batch_file_path_details + "_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
reference_table_out_path = output_folder + batch_file_path_details + "_reference_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
@spaces.GPU
def verify_titles(in_data_file,
file_data:pd.DataFrame,
existing_topics_table:pd.DataFrame,
existing_reference_df:pd.DataFrame,
existing_unique_topics_df:pd.DataFrame,
unique_table_df_display_table_markdown:str,
file_name:str,
num_batches:int,
in_api_key:str,
temperature:float,
chosen_cols:List[str],
model_choice:str,
candidate_topics: GradioFileData = None,
latest_batch_completed:int=0,
out_message:List=[],
out_file_paths:List = [],
log_files_output_paths:List = [],
first_loop_state:bool=False,
whole_conversation_metadata_str:str="",
initial_table_prompt:str=initial_table_prompt,
prompt2:str=prompt2,
prompt3:str=prompt3,
system_prompt:str=system_prompt,
add_existing_topics_system_prompt:str=add_existing_topics_system_prompt,
add_existing_topics_prompt:str=add_existing_topics_prompt,
number_of_prompts_used:int=1,
batch_size:int=50,
context_textbox:str="",
time_taken:float = 0,
sentiment_checkbox:str = "Negative, Neutral, or Positive",
force_zero_shot_radio:str = "No",
in_excel_sheets:List[str] = [],
max_tokens:int=max_tokens,
model_name_map:dict=model_name_map,
max_time_for_loop:int=max_time_for_loop,
progress=Progress(track_tqdm=True)):
'''
Query an LLM (local Gemma 2B Instruct, Gemini, or Anthropic models on AWS Bedrock) with up to three prompts about a table of open text data in order to verify whether titles are suitable. Up to 'batch_size' rows will be queried at a time.
Parameters:
- in_data_file (gr.File): Gradio file object containing input data
- file_data (pd.DataFrame): Pandas dataframe containing the consultation response data.
- existing_topics_table (pd.DataFrame): Pandas dataframe containing the latest master topic table that has been iterated through batches.
- existing_reference_df (pd.DataFrame): Pandas dataframe containing the list of Response reference numbers alongside the derived topics and subtopics.
- existing_unique_topics_df (pd.DataFrame): Pandas dataframe containing the unique list of topics, subtopics, sentiment and summaries until this point.
- unique_table_df_display_table_markdown (str): Table for display in markdown format.
- file_name (str): File name of the data file.
- num_batches (int): Number of batches required to go through all the response rows.
- in_api_key (str): The API key for authentication.
- temperature (float): The temperature parameter for the model.
- chosen_cols (List[str]): A list of chosen columns to process.
- model_choice (str): The choice of model to use.
- candidate_topics (GradioFileData): A Gradio file object of existing candidate topics submitted by the user.
- latest_batch_completed (int): The index of the latest file completed.
- out_message (list): A list to store output messages.
- out_file_paths (list): A list to store output file paths.
- log_files_output_paths (list): A list to store log file output paths.
- first_loop_state (bool): A flag indicating the first loop state.
- whole_conversation_metadata_str (str): A string to store whole conversation metadata.
- initial_table_prompt (str): The first prompt for the model.
- prompt2 (str): The second prompt for the model.
- prompt3 (str): The third prompt for the model.
- system_prompt (str): The system prompt for the model.
- add_existing_topics_system_prompt (str): The system prompt for the summary part of the model.
- add_existing_topics_prompt (str): The prompt for the model summary.
- number_of_prompts_used (int): The number of prompts to send to the model.
- batch_size (int): The number of data rows to consider in each request.
- context_textbox (str, optional): A string giving some context to the consultation/task.
- time_taken (float, optional): The amount of time taken to process the responses up until this point.
- sentiment_checkbox (str, optional): What type of sentiment analysis should the topic modeller do?
- force_zero_shot_radio (str, optional): Should responses be forced into a zero shot topic or not.
- in_excel_sheets (List[str], optional): List of excel sheets to load from input file
- max_tokens (int): The maximum number of tokens for the model.
- model_name_map (dict, optional): A dictionary mapping full model name to shortened.
- max_time_for_loop (int, optional): The maximum number of seconds the function should run before breaking out of the loop, so it can be re-run (this avoids timeouts with some AWS services when deployed there).
- progress (Progress): A progress tracker.
'''
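# Illustrative call (a sketch only; the argument values below are hypothetical and in the app
# these are normally wired up from the Gradio UI components):
# verify_titles(in_data_file=uploaded_file, file_data=pd.DataFrame(), existing_topics_table=pd.DataFrame(),
#     existing_reference_df=pd.DataFrame(), existing_unique_topics_df=pd.DataFrame(),
#     unique_table_df_display_table_markdown="", file_name="responses", num_batches=2,
#     in_api_key="", temperature=0.1, chosen_cols=["Suggested title"], model_choice="gemma_2b_it_local",
#     first_loop_state=True)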
tic = time.perf_counter()
model = ""
config = ""
final_time = 0.0
whole_conversation_metadata = []
is_error = False
create_revised_general_topics = False
local_model = []
tokenizer = []
zero_shot_topics_df = pd.DataFrame()
#llama_system_prefix = "<|start_header_id|>system<|end_header_id|>\n" #"<start_of_turn>user\n"
#llama_system_suffix = "<|eot_id|>" #"<end_of_turn>\n<start_of_turn>model\n"
#llama_cpp_prefix = "<|start_header_id|>system<|end_header_id|>\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n" #"<start_of_turn>user\n"
#llama_cpp_suffix = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n" #"<end_of_turn>\n<start_of_turn>model\n"
#llama_cpp_prefix = "<|user|>\n" # This is for phi 3.5
#llama_cpp_suffix = "<|end|>\n<|assistant|>" # This is for phi 3.5
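# <start_of_turn>/<end_of_turn> are the Gemma chat-template markers, used when running the local gemma_2b_it model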
llama_cpp_prefix = "<start_of_turn>user\n"
llama_cpp_suffix = "<end_of_turn>\n<start_of_turn>model\n"
# If you have a file input but no file data it hasn't yet been loaded. Load it here.
if file_data.empty:
print("No data table found, loading from file")
try:
#print("in_data_file:", in_data_file)
in_colnames_drop, in_excel_sheets, file_name = put_columns_in_df(in_data_file)
#print("in_colnames:", in_colnames_drop)
file_data, file_name, num_batches = load_in_data_file(in_data_file, chosen_cols, batch_size_default, in_excel_sheets)
#print("file_data loaded in:", file_data)
except Exception:
# Check if files and text exist
out_message = "Please enter a data file to process."
print(out_message)
raise Exception(out_message)
#model_choice_clean = replace_punctuation_with_underscore(model_choice)
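# Map the full model name to a short, file-name-safe form used when naming output files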
model_choice_clean = model_name_map[model_choice]
# If this is the first time around, set variables to 0/blank
if first_loop_state==True:
print("This is the first time through the loop, resetting latest_batch_completed to 0")
if (latest_batch_completed == 999) | (latest_batch_completed == 0):
latest_batch_completed = 0
out_message = []
out_file_paths = []
#print("model_choice_clean:", model_choice_clean)
if (model_choice == "gemma_2b_it_local") & (RUN_LOCAL_MODEL == "1"):
progress(0.1, "Loading in Gemma 2b model")
local_model, tokenizer = load_model()
print("Local model loaded:", local_model)
if num_batches > 0:
progress_measure = round(latest_batch_completed / num_batches, 1)
progress(progress_measure, desc="Querying large language model")
else:
progress(0.1, desc="Querying large language model")
if latest_batch_completed < num_batches:
# Load file
# If out message or out_file_paths are blank, change to a list so it can be appended to
if isinstance(out_message, str):
out_message = [out_message]
if not out_file_paths:
out_file_paths = []
if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[0] > 300:
out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
print(out_message)
raise Exception(out_message)
if sentiment_checkbox == "Negative, Neutral, or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative or Positive"
elif sentiment_checkbox == "Do not assess sentiment": sentiment_prompt = "Create a third column containing only the text 'Not assessed'"
else: sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
topics_loop_description = "Verifying titles for response batches (each batch of " + str(batch_size) + " responses)."
topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")
for i in topics_loop:
#for latest_batch_completed in range(num_batches):
reported_batch_no = latest_batch_completed + 1
print("Running query batch", str(reported_batch_no))
print("batch_size:", batch_size)
# Call the function to prepare the input table
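# This prepares the current batch: a path to a simplified CSV of the batch, a normalised markdown table
# of the batch responses, the absolute start/end rows the batch covers, and the batch's basic response dataframe.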
simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_df = data_file_to_markdown_table(file_data, file_name, chosen_cols, output_folder, latest_batch_completed, batch_size, verify_titles=True)
#log_files_output_paths.append(simplified_csv_table_path)
# Conversation history
conversation_history = []
print("normalised_simple_markdown_table:", normalised_simple_markdown_table)
# If the latest batch of responses contains at least one instance of text
if not batch_basic_response_df.empty:
# If this is the second batch, the master table will refer back to the current master table when assigning topics to the new table. Also runs if there is an existing list of topics supplied by the user
if latest_batch_completed >= 1 or candidate_topics is not None:
# Prepare Gemini models before query
if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
print("Using Gemini model:", model_choice)
model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=add_existing_topics_system_prompt, max_tokens=max_tokens)
elif model_choice in ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"]:
print("Using AWS Bedrock model:", model_choice)
else:
print("Using local model:", model_choice)
# Format the summary prompt with the response table and topics
formatted_system_prompt = add_existing_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols[0])
formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table)
print("formatted_summary_prompt:", formatted_summary_prompt)
if model_choice == "gemma_2b_it_local":
formatted_summary_prompt = llama_cpp_prefix + formatted_system_prompt + "\n" + formatted_summary_prompt + llama_cpp_suffix
full_prompt = formatted_summary_prompt
else:
full_prompt = formatted_system_prompt + formatted_summary_prompt
#latest_batch_number_string = "batch_" + str(latest_batch_completed - 1)
# Define the output file path for the formatted prompt
formatted_prompt_output_path = output_folder + file_name + "_" + str(reported_batch_no) + "_full_prompt_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
# Write the formatted prompt to the specified file
try:
with open(formatted_prompt_output_path, "w", encoding='utf-8', errors='replace') as f:
f.write(full_prompt)
except Exception as e:
print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")
if model_choice == "gemma_2b_it_local":
summary_prompt_list = [full_prompt] # Includes system prompt
else:
summary_prompt_list = [formatted_summary_prompt]
# print("master_summary_prompt_list:", summary_prompt_list[0])
summary_conversation_history = []
summary_whole_conversation = []
# Process requests to large language model
# responses, summary_conversation_history, whole_conversation, whole_conversation_metadata, response_text = process_requests(summary_prompt_list, add_existing_topics_system_prompt, summary_conversation_history, summary_whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, master = True)
responses, summary_conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master = True)
# print("responses:", responses[-1].text)
# print("Whole conversation metadata:", whole_conversation_metadata)
topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, new_topic_df, new_markdown_table, new_reference_df, new_unique_topics_df, master_batch_out_file_part, is_error = write_llm_output_and_logs_verify(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, batch_size, chosen_cols, first_run=False)
# Write final output to text file for logging purposes
try:
final_table_output_path = output_folder + master_batch_out_file_part + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
if isinstance(responses[-1], ResponseObject):
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
f.write(responses[-1].text)
elif "choices" in responses[-1]:
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
f.write(responses[-1]["choices"][0]['text'])
else:
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
f.write(responses[-1].text)
except Exception as e:
print("Error in returning model response:", e)
# If error in table parsing, leave function
if is_error == True:
final_message_out = "Could not complete title verification, error in LLM output."
raise Exception(final_message_out)
#return unique_table_df_display_table_markdown, new_topic_df, new_unique_topics_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
# Write outputs to csv
## Topics with references
new_topic_df.to_csv(topic_table_out_path, index=None)
log_files_output_paths.append(topic_table_out_path)
## Reference table mapping response numbers to topics
new_reference_df.to_csv(reference_table_out_path, index=None)
out_file_paths.append(reference_table_out_path)
## Unique topic list
new_unique_topics_df = pd.concat([new_unique_topics_df, existing_unique_topics_df]) #.drop_duplicates('Subtopic')
new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
out_file_paths.append(unique_topics_df_out_path)
# Outputs for markdown table output
unique_table_df_display_table = new_unique_topics_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
unique_table_df_display_table_markdown = unique_table_df_display_table.to_markdown(index=False)
#whole_conversation_metadata.append(whole_conversation_metadata_str)
whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
#out_file_paths = [col for col in out_file_paths if latest_batch_number_string in col]
#log_files_output_paths = [col for col in log_files_output_paths if latest_batch_number_string in col]
out_file_paths = [col for col in out_file_paths if str(reported_batch_no) in col]
log_files_output_paths = [col for col in log_files_output_paths if str(reported_batch_no) in col]
#print("out_file_paths at end of loop:", out_file_paths)
# If this is the first batch, run this
else:
#system_prompt = system_prompt + normalised_simple_markdown_table
# Prepare Gemini models before query
if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
print("Using Gemini model:", model_choice)
model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
elif model_choice in ["gemma_2b_it_local"]:
print("Using local Gemma 2b model")
else:
print("Using AWS Bedrock model:", model_choice)
formatted_initial_table_system_prompt = system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols[0])
formatted_initial_table_prompt = initial_table_prompt.format(response_table=normalised_simple_markdown_table)
if prompt2: formatted_prompt2 = prompt2.format(response_table=normalised_simple_markdown_table)
else: formatted_prompt2 = prompt2
if prompt3: formatted_prompt3 = prompt3.format(response_table=normalised_simple_markdown_table)
else: formatted_prompt3 = prompt3
if model_choice == "gemma_2b_it_local":
formatted_initial_table_prompt = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_initial_table_prompt + llama_cpp_suffix
formatted_prompt2 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt2 + llama_cpp_suffix
formatted_prompt3 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt3 + llama_cpp_suffix
batch_prompts = [formatted_initial_table_prompt, formatted_prompt2, formatted_prompt3][:number_of_prompts_used] # Adjust this list to send fewer requests
whole_conversation = [formatted_initial_table_system_prompt]
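# call_llm_with_markdown_table_checks sends the batch prompts and, as its name suggests, re-queries the
# model up to MAX_OUTPUT_VALIDATION_ATTEMPTS times if the output cannot be parsed as a markdown table.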
responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS)
topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_table_df, markdown_table, reference_df, new_unique_topics_df, batch_file_path_details, is_error = write_llm_output_and_logs_verify(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, batch_size, chosen_cols, first_run=True)
# If error in table parsing, leave function
if is_error == True:
raise Exception("Error in output table parsing")
# unique_table_df_display_table_markdown, new_topic_df, new_unique_topics_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
#all_topic_tables_df.append(topic_table_df)
topic_table_df.to_csv(topic_table_out_path, index=None)
out_file_paths.append(topic_table_out_path)
reference_df.to_csv(reference_table_out_path, index=None)
out_file_paths.append(reference_table_out_path)
## Unique topic list
new_unique_topics_df = pd.concat([new_unique_topics_df, existing_unique_topics_df])
new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
out_file_paths.append(unique_topics_df_out_path)
#all_markdown_topic_tables.append(markdown_table)
whole_conversation_metadata.append(whole_conversation_metadata_str)
whole_conversation_metadata_str = '. '.join(whole_conversation_metadata)
# Write final output to text file also
try:
final_table_output_path = output_folder + batch_file_path_details + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
if isinstance(responses[-1], ResponseObject):
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
f.write(responses[-1].text)
unique_table_df_display_table_markdown = responses[-1].text
elif "choices" in responses[-1]:
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
f.write(responses[-1]["choices"][0]['text'])
unique_table_df_display_table_markdown =responses[-1]["choices"][0]['text']
else:
with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
f.write(responses[-1].text)
unique_table_df_display_table_markdown = responses[-1].text
log_files_output_paths.append(final_table_output_path)
except Exception as e:
print("Error in returning model response:", e)
new_topic_df = topic_table_df
new_reference_df = reference_df
else:
print("Current batch of responses contains no text, moving onto next. Batch number:", str(latest_batch_completed + 1), ". Start row:", start_row, ". End row:", end_row)
# Increase latest file completed count unless we are over the last batch number
if latest_batch_completed <= num_batches:
print("Completed batch number:", str(reported_batch_no))
latest_batch_completed += 1
toc = time.perf_counter()
final_time = toc - tic
if final_time > max_time_for_loop:
print("Max time reached, breaking loop.")
topics_loop.close()
tqdm._instances.clear()
break
# Overwrite 'existing' elements to add new tables
existing_reference_df = new_reference_df.dropna(how='all')
existing_unique_topics_df = new_unique_topics_df.dropna(how='all')
existing_topics_table = new_topic_df.dropna(how='all')
# The topic table that can be modified does not need the summary column
modifiable_unique_topics_df = existing_unique_topics_df#.drop("Summary", axis=1)
out_time = f"{final_time:0.1f} seconds."
out_message.append('All queries successfully completed in')
final_message_out = '\n'.join(out_message)
final_message_out = final_message_out + " " + out_time
print(final_message_out)
# If we have extracted topics from the last batch, return the input out_message and file list to the relevant components
if latest_batch_completed >= num_batches:
print("Last batch reached, returning batch:", str(latest_batch_completed))
# Set to a very high number so as not to mess with subsequent file processing by the user
#latest_batch_completed = 999
toc = time.perf_counter()
final_time = (toc - tic) + time_taken
out_time = f"Everything finished in {round(final_time,1)} seconds."
print(out_time)
print("All summaries completed. Creating outputs.")
model_choice_clean = model_name_map[model_choice]
# Clean the chosen column name so it can be used safely in output file names
in_column_cleaned = clean_column_name(chosen_cols, max_length=20)
# Need to reduce output file names as full length files may be too long
file_name = clean_column_name(file_name, max_length=30)
# Save outputs for each batch. If master file created, label file as master
file_path_details = f"{file_name}_col_{in_column_cleaned}"
# Create a pivoted reference table
#existing_reference_df_pivot = convert_reference_table_to_pivot_table(existing_reference_df)
# Save the new DataFrame to CSV
#topic_table_out_path = output_folder + batch_file_path_details + "_topic_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
#reference_table_out_pivot_path = output_folder + file_path_details + "_final_reference_table_pivot_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
reference_table_out_path = output_folder + file_path_details + "_final_reference_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
unique_topics_df_out_path = output_folder + file_path_details + "_final_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
basic_response_data_out_path = output_folder + file_path_details + "_simplified_data_file_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
## Reference table mapping response numbers to topics
existing_reference_df.to_csv(reference_table_out_path, index=None)
out_file_paths.append(reference_table_out_path)
# Create final unique topics table from reference table to ensure consistent numbers
final_out_unique_topics_df = existing_unique_topics_df #create_unique_table_df_from_reference_table(existing_reference_df)
## Unique topic list
final_out_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
out_file_paths.append(unique_topics_df_out_path)
# Ensure that we are only returning the final results to outputs
out_file_paths = [x for x in out_file_paths if '_final_' in x]
## Reference table mapping response numbers to topics
#existing_reference_df_pivot.to_csv(reference_table_out_pivot_path, index = None)
#log_files_output_paths.append(reference_table_out_pivot_path)
## Create a dataframe for missing response references:
# Simplify table to just responses column and the Response reference number
basic_response_data = get_basic_response_data(file_data, chosen_cols, verify_titles=True)
# Save simplified file data to log outputs
pd.DataFrame(basic_response_data).to_csv(basic_response_data_out_path, index=None)
log_files_output_paths.append(basic_response_data_out_path)
# Step 1: Identify missing references
missing_references = basic_response_data[~basic_response_data['Reference'].astype(str).isin(existing_reference_df['Response References'].astype(str).unique())]
# Step 2: Create a new DataFrame with the same columns as existing_reference_df
missing_df = pd.DataFrame(columns=existing_reference_df.columns)
# Step 3: Populate the new DataFrame
missing_df['Response References'] = missing_references['Reference']
missing_df = missing_df.fillna(np.nan) #.infer_objects(copy=False) # Fill other columns with NA
# Display the new DataFrame
#print("missing_df:", missing_df)
missing_df_out_path = output_folder + file_path_details + "_missing_references_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
missing_df.to_csv(missing_df_out_path, index=None)
log_files_output_paths.append(missing_df_out_path)
out_file_paths = list(set(out_file_paths))
log_files_output_paths = list(set(log_files_output_paths))
final_out_file_paths = [file_path for file_path in out_file_paths if "final_" in file_path]
# The topic table that can be modified does not need the summary column
modifiable_unique_topics_df = final_out_unique_topics_df#.drop("Summary", axis=1)
print("latest_batch_completed at end of batch iterations to return is", latest_batch_completed)
return unique_table_df_display_table_markdown, existing_topics_table, final_out_unique_topics_df, existing_reference_df, final_out_file_paths, final_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, final_out_file_paths, final_out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), final_out_file_paths
return unique_table_df_display_table_markdown, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), out_file_paths