# app.py — Gradio demo: semantic search over an uploaded .txt document
# using the nasa-impact/nasa-smd-ibm-st-v2 sentence-similarity model
# served through the Hugging Face Inference API.
import gradio as gr
import requests
import os
import re
# Hugging Face Inference API token, read from the environment.
# NOTE(review): if API_TOKEN is unset this is None and the header below
# becomes "Bearer None" — requests will still be sent, but rejected.
API_TOKEN = os.getenv('API_TOKEN')
# Hosted sentence-similarity endpoint for the NASA SMD/IBM model.
API_URL = "https://api-inference.huggingface.co/models/nasa-impact/nasa-smd-ibm-st-v2"
# Authorization header attached to every inference request.
headers = {"Authorization": f"Bearer {API_TOKEN}"}
def query_similarity(source_sentence, sentences):
    """Score *sentences* against *source_sentence* via the HF Inference API.

    Parameters
    ----------
    source_sentence : str
        The query text.
    sentences : list[str]
        Candidate texts to score for similarity against the query.

    Returns
    -------
    tuple
        (result, sentences) where result is the parsed JSON body — for this
        model a list of float scores parallel to *sentences* — or a dict
        with an "error" key when the body is not valid JSON.
    """
    payload = {
        "inputs": {
            "source_sentence": source_sentence,
            "sentences": sentences
        }
    }
    response = requests.post(API_URL, headers=headers, json=payload)
    # response.json() raises a ValueError subclass (json.JSONDecodeError /
    # requests.exceptions.JSONDecodeError) on a malformed body. The original
    # caught json.JSONDecodeError without importing json, so a decode failure
    # raised NameError instead; catching ValueError covers both and needs no
    # extra import.
    try:
        return response.json(), sentences
    except ValueError:
        return {"error": "Failed to decode JSON response"}, sentences
def format_output(response, sentences):
    """Render similarity scores as a best-first, human-readable listing.

    *response* is expected to be a list of float scores parallel to
    *sentences*; any other payload is echoed back as unexpected.
    """
    if not isinstance(response, list):
        return f"Unexpected response format: {response}"
    # Pair each score with its sentence and rank best matches first.
    ranked = sorted(zip(response, sentences), key=lambda pair: pair[0], reverse=True)
    rendered = [
        f"Sentence: {text.strip()}, Score: {value:.4f}\n"
        for value, text in ranked
    ]
    return "\n".join(rendered)
def split_into_chunks(text, chunk_size=100):
    """Split *text* into chunks of at most roughly *chunk_size* words.

    The text is first split into sentences at '.', '!' or '?' followed by
    spaces, then sentences are greedily packed into chunks so no chunk
    exceeds *chunk_size* words — except a single sentence longer than
    *chunk_size*, which becomes a chunk on its own.

    Parameters
    ----------
    text : str
        Document to split.
    chunk_size : int, optional
        Approximate maximum number of words per chunk (default 100).

    Returns
    -------
    list[str]
        Chunks in original order.
    """
    sentences = re.split(r'(?<=[.!?]) +', text)  # Split text into sentences
    chunks = []
    current_chunk = []
    current_length = 0
    for sentence in sentences:
        sentence_length = len(sentence.split())
        if current_length + sentence_length > chunk_size:
            # Flush the chunk built so far — but only if it has content.
            # The original unconditionally appended here, emitting a spurious
            # empty-string chunk whenever the very first sentence already
            # exceeded chunk_size.
            if current_chunk:
                chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_length = sentence_length
        else:
            current_chunk.append(sentence)
            current_length += sentence_length
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
def semantic_search(query, file_path):
    """Rank chunks of an uploaded .txt document by similarity to *query*.

    Returns the formatted ranking as text, or a prompt to upload a file
    when no path was provided.
    """
    # Guard clause: Gradio passes None when no file was uploaded.
    if file_path is None:
        return "Please upload a .txt file."
    # Read the whole document, chunk it, and score every chunk remotely.
    with open(file_path, 'r', encoding='utf-8') as handle:
        document = handle.read()
    scores, chunk_list = query_similarity(query, split_into_chunks(document))
    return format_output(scores, chunk_list)
# Define Gradio interface
# Two inputs — a free-text query and a .txt upload — mapped through
# semantic_search to a plain-text ranking of the most similar chunks.
iface = gr.Interface(
    fn=semantic_search,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your query here..."),
        gr.File(file_types=['txt'], label="Upload a .txt file")
    ],
    outputs="text",
    title="Document Semantic Search",
    description="Input a query and upload a document (.txt) to find the most semantically similar paragraphs or sentences."
)
# Start the web app (blocks until the server is stopped).
iface.launch()