from collections import Counter

import os
import string
from typing import Optional

import requests
from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

from src.application.text.identity import extract_entities

load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")


def search_by_google(
    query,
    num_results=10,
    is_exact_terms=False,
) -> Optional[dict]:
    """
    Searches the Google Custom Search Engine for the given query.

    Args:
        query: The search query.
        num_results: The number of results to return (default: 10).
        is_exact_terms: Whether to use exact-terms search (True) or regular
            search (False).

    Returns:
        A dictionary containing the search results, or None if there was an error.
    """
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "key": GOOGLE_API_KEY,
        "cx": SEARCH_ENGINE_ID,
        "num": num_results,
    }
    if is_exact_terms:
        params["exactTerms"] = query
    else:
        params["q"] = query.replace('"', "")

    response = requests.get(url, params=params)
    if response.status_code == 200:
        return response.json()

    print(f"Error: {response.status_code}, {response.text}")
    return None


def get_most_frequent_words(input_text, number_word=32):
    """
    Builds a search phrase from the most frequent words in the input text,
    excluding stop words and punctuation.

    Args:
        input_text: The input text as a string.
        number_word: The number of top words to include.

    Returns:
        A space-separated string of the most frequent words (at most 32).
        Returns an empty string if the input is not a non-empty string.
    """
    if not isinstance(input_text, str) or not input_text:
        return ""

    words = word_tokenize(input_text.lower())  # Tokenize and lowercase

    stop_words = set(stopwords.words("english"))
    punctuation = set(string.punctuation)  # All punctuation characters
    filtered_words = [
        word
        for word in words
        if word.isalnum() and word not in stop_words and word not in punctuation
    ]

    word_frequencies = Counter(filtered_words)
    top_words = [word for word, _ in word_frequencies.most_common(number_word)]

    # Cap the phrase at 32 words so it stays usable as a search query.
    search_phrase = " ".join(top_words[: min(number_word, 32)])

    return search_phrase


def get_chunk(input_text, chunk_length=32, num_chunk=3):
    """
    Splits the input text into chunks of a specified length.

    Args:
        input_text: The input text as a string.
        chunk_length: The desired length of each chunk (in words).
        num_chunk: The maximum number of chunks to create.

    Returns:
        A list of string chunks. Returns an empty list if input is invalid.
    """
    if not isinstance(input_text, str):
        return []

    chunks = []
    input_words = input_text.split()  # Split by any whitespace

    for i in range(num_chunk):
        start_index = i * chunk_length
        end_index = (i + 1) * chunk_length
        chunk = " ".join(input_words[start_index:end_index])
        if chunk:  # Only append non-empty chunks
            chunks.append(chunk)

    return chunks


def get_keywords(text, num_keywords=5):
    """Returns the top `num_keywords` keywords from a document using TF-IDF."""
    # Create a TF-IDF vectorizer
    vectorizer = TfidfVectorizer(stop_words="english")

    # Fit and transform the text
    tfidf_matrix = vectorizer.fit_transform([text])

    # Get feature names (words)
    feature_names = vectorizer.get_feature_names_out()

    # Get TF-IDF scores
    tfidf_scores = tfidf_matrix.toarray()[0]

    # Sort words by TF-IDF score
    word_scores = list(zip(feature_names, tfidf_scores))
    word_scores.sort(key=lambda x: x[1], reverse=True)

    # Return top keywords
    return [word for word, score in word_scores[:num_keywords]]


def generate_search_phrases(input_text):
    """
    Generates different types of phrases for search purposes.

    Args:
        input_text: The input text.

    Returns:
        A list of search phrases containing:
            - A phrase built from the most frequent words.
            - The original input text.
            - Up to `num_chunk` text chunks.
            - A phrase combining named entities and TF-IDF keywords.
    """
    if not isinstance(input_text, str):
        return []

    search_phrases = []

    # Method 1: Most frequent words
    search_phrases.append(get_most_frequent_words(input_text))

    # Method 2: The whole text
    search_phrases.append(input_text)

    # Method 3: Split the text into chunks
    search_phrases.extend(get_chunk(input_text))

    # Method 4: Named entities combined with TF-IDF keywords
    entities = extract_entities(input_text)
    keywords = get_keywords(input_text, 16)
    search_phrase = " ".join(entities) + " " + " ".join(keywords)
    search_phrases.append(search_phrase)

    return search_phrases