"""
Author: Khanh Phan
Date: 2024-12-04
"""

import string
from collections import Counter
from typing import Optional

import requests
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

from src.application.config import (
    CHUNK_SIZE,
    GOOGLE_API_KEY,
    GOOGLE_ENDPOINT_URL,
    NUM_CHUNKS,
    NUM_FREQUENT_WORDS,
    NUM_KEYWORDS,
    SEARCH_ENGINE_ID,
    STOPWORDS_LANG,
    TOP_SEARCH_RESUTLS,
)
from src.application.text.entity import extract_entities


def search_by_google(
    query,
    num_results=TOP_SEARCH_RESUTLS,
    is_exact_terms=False,
    timeout=10,
) -> Optional[dict]:
    """
    Performs a Google Custom Search API query.

    Args:
        query (str): The search query string.
        num_results (int, optional): The number of search results to return.
            Defaults to TOP_SEARCH_RESUTLS.
        is_exact_terms (bool, optional): If True, the query is sent as the
            "exactTerms" parameter; otherwise it is sent as "q" with double
            quotes stripped. Defaults to False.
        timeout (float, optional): Seconds to wait for the HTTP request
            before giving up. Defaults to 10.

    Returns:
        dict: JSON response from the Google Custom Search API,
            None if the request fails or the status code is not 200.
    """
    params = {
        "key": GOOGLE_API_KEY,
        "cx": SEARCH_ENGINE_ID,
        "num": num_results,
    }
    if is_exact_terms:
        params["exactTerms"] = query
    else:
        # Double quotes are stripped from the plain "q" query.
        params["q"] = query.replace('"', "")

    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(
            GOOGLE_ENDPOINT_URL,
            params=params,
            timeout=timeout,
        )
    except requests.RequestException as error:
        # Connection errors and timeouts follow the same "None on error"
        # contract as a non-200 response.
        print(f"Error: request failed, {error}")
        return None

    if response.status_code == 200:
        return response.json()

    print(f"Error: {response.status_code}, {response.text}")
    return None


def get_most_frequent_words(
    input_text: str,
    number_word: int = NUM_FREQUENT_WORDS,
) -> Optional[str]:
    """
    Extracts the most frequent words from the input text
        and forms a search phrase.

    Args:
        input_text (str): The text from which to extract frequent words.
        number_word (int, optional): The number of frequent words to extract.
            Defaults to NUM_FREQUENT_WORDS.

    Returns:
        str: A search phrase consisting of the most frequent words, ordered
            from most to least frequent and separated by single spaces.
            None if input_text is not a non-empty string.
    """
    # Check if the input text is valid
    if not isinstance(input_text, str) or not input_text:
        return None

    # Tokenize the input text into words and convert to lowercase
    tokens = word_tokenize(input_text.lower())

    # Stop words for the configured language, plus punctuation characters
    stop_words = set(stopwords.words(STOPWORDS_LANG))
    punctuation = set(string.punctuation)

    # Keep only alphanumeric tokens that are neither stop words
    # nor punctuation
    filtered_words = [
        token
        for token in tokens
        if token.isalnum()
        and token not in stop_words
        and token not in punctuation
    ]

    # Count the filtered words and keep the most common ones.
    # The phrase is built from these counted words only; the raw token
    # list must not leak into the result.
    word_frequencies = Counter(filtered_words)
    top_words = [
        word for word, _count in word_frequencies.most_common(number_word)
    ]

    return " ".join(top_words)


def get_chunk(
    input_text: str,
    chunk_size: int = CHUNK_SIZE,
    num_chunk: int = NUM_CHUNKS,
) -> list[str]:
    """
    Splits the input text into chunks of a specified size.

    Args:
        input_text (str): The text to be chunked.
        chunk_size (int, optional): The number of words per chunk.
        num_chunk (int, optional): The maximum number of chunks to generate.

    Returns:
        list: The non-empty chunks taken from the start of the text, each a
            space-joined run of up to chunk_size words. Empty if input_text
            is not a string.
    """
    if not isinstance(input_text, str):
        return []

    input_words = input_text.split()  # Split by any whitespace

    chunks = []
    for index in range(num_chunk):
        start_index = index * chunk_size
        end_index = start_index + chunk_size
        chunk = " ".join(input_words[start_index:end_index])
        if chunk:  # Only keep non-empty chunks
            chunks.append(chunk)

    return chunks


def get_keywords(text: str, num_keywords: int = NUM_KEYWORDS) -> list[str]:
    """
    Extracts the top keywords from a given text using the TF-IDF method.

    Args:
        text (str): The input text from which to extract keywords.
        num_keywords (int, optional): The number of top keywords to return.

    Returns:
        list: A list of strings representing the top keywords extracted
            from the text, highest score first. Empty if text is not a
            string or contains only whitespace.

    Raises:
        ValueError: Propagated from scikit-learn when no vocabulary terms
            remain after tokenization and stop-word removal.
    """
    # Nothing to vectorize; fit_transform would fail on such input.
    if not isinstance(text, str) or not text.strip():
        return []

    # Create a TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(stop_words=STOPWORDS_LANG)

    # Fit and transform the text (a corpus of exactly one document)
    tfidf_matrix = vectorizer.fit_transform([text])

    # Get feature names (words) and the scores of the single document
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.toarray()[0]

    # Sort words by TF-IDF score, highest first; the sort is stable, so
    # equally scored words keep the order of feature_names.
    word_scores = sorted(
        zip(feature_names, tfidf_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Return top keywords
    return [word for word, _score in word_scores[:num_keywords]]


def generate_search_phrases(input_text: str) -> list[str]:
    """
    Generates different types of phrases for search purposes.

    Args:
        input_text: The input text.

    Returns:
        A flat list containing, in this order:
            - The phrase of most frequent words (None when input_text is
              an empty string).
            - The original input text.
            - The text chunks, one entry per chunk.
            - The text with entities removed.
        Empty if input_text is not a string.
    """
    if not isinstance(input_text, str):
        return []

    search_phrases = []

    # Method 1: Get most frequent words
    search_phrases.append(get_most_frequent_words(input_text))

    # Method 2: Get the whole text
    search_phrases.append(input_text)

    # Method 3: Split text by chunks
    search_phrases.extend(get_chunk(input_text))  # TODO: for demo purposes

    # Method 4: Remove identities and key words
    entities = extract_entities(input_text)
    text_without_entities = remove_identities_from_text(input_text, entities)
    search_phrases.append(text_without_entities)
    # keywords = get_keywords(input_text, 16)
    # search_phrase = " ".join(entities) + " " + " ".join(keywords)
    # search_phrases.append(search_phrase)  # TODO: for demo purposes

    return search_phrases


def remove_identities_from_text(input_text: str, entities: list[str]) -> str:
    """
    Removes entities from the input text.

    Args:
        input_text: The input text as a string.
        entities: A list of entities to be removed. None is treated as an
            empty list; empty or non-string entries are skipped.

    Returns:
        The input text with every occurrence of each entity deleted.
        Surrounding whitespace is left untouched.
    """
    for entity in entities or []:
        # Skip entries that str.replace cannot use or that remove nothing.
        if not isinstance(entity, str) or not entity:
            continue
        input_text = input_text.replace(entity, "")

    return input_text