Commit 4c95b3c
Parent(s): f188b10

Fix for fuzzy matching

Files changed:
- tools/file_redaction.py (+5 -3)
- tools/load_spacy_model_custom_recognisers.py (+111 -111)
tools/file_redaction.py
CHANGED
@@ -468,10 +468,12 @@ def choose_and_run_redactor(file_paths:List[str],
     ### Language check - check if selected language packs exist
     try:
         if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION and chosen_local_model == "tesseract":
-
-
+            if language != "en":
+                progress(0.1, desc=f"Downloading Tesseract language pack for {language}")
+                download_tesseract_lang_pack(language)
 
-
+            if language != "en":
+                progress(0.1, desc=f"Loading SpaCy model for {language}")
             load_spacy_model(language)
 
     except Exception as e:
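For context: the added block only fetches extra resources when a non-English language is selected, while the SpaCy model load on the unchanged context line still runs either way. A minimal, self-contained sketch of that guard pattern follows; the two helpers are stubbed purely for illustration, since their real implementations live elsewhere in tools/ and are not shown in this diff.

def download_tesseract_lang_pack(language: str) -> None:
    # Stub for illustration only; the project's helper fetches the Tesseract language data.
    print(f"(stub) would fetch the Tesseract pack for {language}")

def load_spacy_model(language: str) -> None:
    # Stub for illustration only; the project's helper loads/downloads a SpaCy pipeline.
    print(f"(stub) would load a SpaCy model for {language}")

def prepare_language_resources(language: str) -> None:
    # Mirrors the diff: downloads and progress messages are gated on non-English input,
    # but the SpaCy model load itself is not gated (it is a context line above).
    if language != "en":
        download_tesseract_lang_pack(language)

    if language != "en":
        print(f"Loading SpaCy model for {language}")
    load_spacy_model(language)

prepare_language_resources("fr")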
tools/load_spacy_model_custom_recognisers.py
CHANGED
@@ -396,113 +396,6 @@ def custom_fuzzy_word_list_regex(text:str, custom_list:List[str]=[]):
 
     return start_positions, end_positions
 
-def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mistakes_max:int = 1, search_whole_phrase:bool=True, nlp=nlp, progress=gr.Progress(track_tqdm=True)):
-    ''' Conduct fuzzy match on a list of text data.'''
-
-    all_matches = []
-    all_start_positions = []
-    all_end_positions = []
-    all_ratios = []
-
-    #print("custom_query_list:", custom_query_list)
-
-    if not text:
-        out_message = "No text data found. Skipping page."
-        print(out_message)
-        return all_start_positions, all_end_positions
-
-    for string_query in custom_query_list:
-
-        #print("text:", text)
-        #print("string_query:", string_query)
-
-        query = nlp(string_query)
-
-        if search_whole_phrase == False:
-            # Keep only words that are not stop words
-            token_query = [token.text for token in query if not token.is_space and not token.is_stop and not token.is_punct]
-
-            spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)
-
-            #print("token_query:", token_query)
-
-            if len(token_query) > 1:
-                #pattern_lemma = [{"LEMMA": {"IN": query}}]
-                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": token_query}}}]
-            else:
-                #pattern_lemma = [{"LEMMA": query[0]}]
-                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: token_query[0]}}]
-
-            matcher = Matcher(nlp.vocab)
-            matcher.add(string_query, [pattern_fuzz])
-            #matcher.add(string_query, [pattern_lemma])
-
-        else:
-            # If matching a whole phrase, use Spacy PhraseMatcher, then consider similarity after using Levenshtein distance.
-            #tokenised_query = [string_query.lower()]
-            # If you want to match the whole phrase, use phrase matcher
-            matcher = FuzzyMatcher(nlp.vocab)
-            patterns = [nlp.make_doc(string_query)] # Convert query into a Doc object
-            matcher.add("PHRASE", patterns, [{"ignore_case": True}])
-
-        batch_size = 256
-        docs = nlp.pipe([text], batch_size=batch_size)
-
-        # Get number of matches per doc
-        for doc in docs: #progress.tqdm(docs, desc = "Searching text", unit = "rows"):
-            matches = matcher(doc)
-            match_count = len(matches)
-
-            # If considering each sub term individually, append match. If considering together, consider weight of the relevance to that of the whole phrase.
-            if search_whole_phrase==False:
-                all_matches.append(match_count)
-
-                for match_id, start, end in matches:
-                    span = str(doc[start:end]).strip()
-                    query_search = str(query).strip()
-                    #print("doc:", doc)
-                    #print("span:", span)
-                    #print("query_search:", query_search)
-
-                    # Convert word positions to character positions
-                    start_char = doc[start].idx # Start character position
-                    end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
-
-                    # The positions here are word position, not character position
-                    all_matches.append(match_count)
-                    all_start_positions.append(start_char)
-                    all_end_positions.append(end_char)
-
-            else:
-                for match_id, start, end, ratio, pattern in matches:
-                    span = str(doc[start:end]).strip()
-                    query_search = str(query).strip()
-                    #print("doc:", doc)
-                    #print("span:", span)
-                    #print("query_search:", query_search)
-
-                    # Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
-                    distance = Levenshtein.distance(query_search.lower(), span.lower())
-
-                    #print("Levenshtein distance:", distance)
-
-                    if distance > spelling_mistakes_max:
-                        match_count = match_count - 1
-                    else:
-                        # Convert word positions to character positions
-                        start_char = doc[start].idx # Start character position
-                        end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
-
-                        #print("start_char:", start_char)
-                        #print("end_char:", end_char)
-
-                        all_matches.append(match_count)
-                        all_start_positions.append(start_char)
-                        all_end_positions.append(end_char)
-                        all_ratios.append(ratio)
-
-
-    return all_start_positions, all_end_positions
 
 class CustomWordFuzzyRecognizer(EntityRecognizer):
     def __init__(self, supported_entities: List[str], custom_list: List[str] = [], spelling_mistakes_max: int = 1, search_whole_phrase: bool = True):
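The function removed above (and re-added further down in this file) leans on spaCy's fuzzy token matching, where a pattern key of "FUZZY" plus a digit allows up to that many edits per token (available since spaCy 3.5). A standalone illustration of that pattern shape, independent of this repo; the pipeline, terms and sample sentence are invented.

import spacy
from spacy.matcher import Matcher

# A blank pipeline is enough for tokenisation; no trained model is needed here.
nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)

# Same shape as the pattern built in spacy_fuzzy_search:
# {"TEXT": {"FUZZY1": {"IN": token_query}}} allows one edit per matched token.
matcher.add("SURNAMES", [[{"TEXT": {"FUZZY1": {"IN": ["Smith", "Jones"]}}}]])

doc = nlp("Mr Smyth met Ms Jomes yesterday.")
for match_id, start, end in matcher(doc):
    # Expect the misspelled tokens "Smyth" and "Jomes" to match.
    print(doc[start:end].text)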
@@ -537,13 +430,11 @@ class CustomWordFuzzyRecognizer(EntityRecognizer):
 custom_list_default = []
 custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
 
-
 # Pass the loaded model to the new LoadedSpacyNlpEngine
 loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp, language_code = ACTIVE_LANGUAGE_CODE)
 
-
 def create_nlp_analyser(language: str = DEFAULT_LANGUAGE, custom_list: List[str] = None,
-                        spelling_mistakes_max: int = 1, search_whole_phrase: bool = True, existing_nlp_analyser: AnalyzerEngine = None):
+                        spelling_mistakes_max: int = 1, search_whole_phrase: bool = True, existing_nlp_analyser: AnalyzerEngine = None, return_also_model: bool = False):
     """
     Create an nlp_analyser object based on the specified language input.
 
@@ -552,6 +443,8 @@ def create_nlp_analyser(language: str = DEFAULT_LANGUAGE, custom_list: List[str]
         custom_list (List[str], optional): List of custom words to recognize. Defaults to None.
         spelling_mistakes_max (int, optional): Maximum number of spelling mistakes for fuzzy matching. Defaults to 1.
         search_whole_phrase (bool, optional): Whether to search for whole phrases or individual words. Defaults to True.
+        existing_nlp_analyser (AnalyzerEngine, optional): Existing nlp_analyser object to use. Defaults to None.
+        return_also_model (bool, optional): Whether to return the nlp_model object as well. Defaults to False.
 
     Returns:
         AnalyzerEngine: Configured nlp_analyser object with custom recognizers
@@ -606,10 +499,117 @@ def create_nlp_analyser(language: str = DEFAULT_LANGUAGE, custom_list: List[str]
     nlp_analyser.registry.add_recognizer(street_recogniser)
     nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
     nlp_analyser.registry.add_recognizer(titles_recogniser)
+
+    if return_also_model:
+        return nlp_analyser, nlp_model
 
     return nlp_analyser
 
 # Create the default nlp_analyser using the new function
-nlp_analyser = create_nlp_analyser(DEFAULT_LANGUAGE)
+nlp_analyser, nlp_model = create_nlp_analyser(DEFAULT_LANGUAGE, return_also_model=True)
+
+def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mistakes_max:int = 1, search_whole_phrase:bool=True, nlp=nlp_model, progress=gr.Progress(track_tqdm=True)):
+    ''' Conduct fuzzy match on a list of text data.'''
+
+    all_matches = []
+    all_start_positions = []
+    all_end_positions = []
+    all_ratios = []
+
+    #print("custom_query_list:", custom_query_list)
+
+    if not text:
+        out_message = "No text data found. Skipping page."
+        print(out_message)
+        return all_start_positions, all_end_positions
+
+    for string_query in custom_query_list:
+
+        query = nlp(string_query)
+
+        if search_whole_phrase == False:
+            # Keep only words that are not stop words
+            token_query = [token.text for token in query if not token.is_space and not token.is_stop and not token.is_punct]
+
+            spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)
+
+            if len(token_query) > 1:
+                #pattern_lemma = [{"LEMMA": {"IN": query}}]
+                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": token_query}}}]
+            else:
+                #pattern_lemma = [{"LEMMA": query[0]}]
+                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: token_query[0]}}]
+
+            matcher = Matcher(nlp.vocab)
+            matcher.add(string_query, [pattern_fuzz])
+            #matcher.add(string_query, [pattern_lemma])
+
+        else:
+            # If matching a whole phrase, use Spacy PhraseMatcher, then consider similarity after using Levenshtein distance.
+            #tokenised_query = [string_query.lower()]
+            # If you want to match the whole phrase, use phrase matcher
+            matcher = FuzzyMatcher(nlp.vocab)
+            patterns = [nlp.make_doc(string_query)] # Convert query into a Doc object
+            matcher.add("PHRASE", patterns, [{"ignore_case": True}])
+
+        batch_size = 256
+        docs = nlp.pipe([text], batch_size=batch_size)
+
+        # Get number of matches per doc
+        for doc in docs: #progress.tqdm(docs, desc = "Searching text", unit = "rows"):
+            matches = matcher(doc)
+            match_count = len(matches)
+
+            # If considering each sub term individually, append match. If considering together, consider weight of the relevance to that of the whole phrase.
+            if search_whole_phrase==False:
+                all_matches.append(match_count)
+
+                for match_id, start, end in matches:
+                    span = str(doc[start:end]).strip()
+                    query_search = str(query).strip()
+                    #print("doc:", doc)
+                    #print("span:", span)
+                    #print("query_search:", query_search)
+
+                    # Convert word positions to character positions
+                    start_char = doc[start].idx # Start character position
+                    end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
+
+                    # The positions here are word position, not character position
+                    all_matches.append(match_count)
+                    all_start_positions.append(start_char)
+                    all_end_positions.append(end_char)
+
+            else:
+                for match_id, start, end, ratio, pattern in matches:
+                    span = str(doc[start:end]).strip()
+                    query_search = str(query).strip()
+                    #print("doc:", doc)
+                    #print("span:", span)
+                    #print("query_search:", query_search)
+
+                    # Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
+                    distance = Levenshtein.distance(query_search.lower(), span.lower())
+
+                    #print("Levenshtein distance:", distance)
+
+                    if distance > spelling_mistakes_max:
+                        match_count = match_count - 1
+                    else:
+                        # Convert word positions to character positions
+                        start_char = doc[start].idx # Start character position
+                        end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
+
+                        #print("start_char:", start_char)
+                        #print("end_char:", end_char)
+
+                        all_matches.append(match_count)
+                        all_start_positions.append(start_char)
+                        all_end_positions.append(end_char)
+                        all_ratios.append(ratio)
+
+
+    return all_start_positions, all_end_positions
 
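Taken together, the module now builds the default analyser (AnalyzerEngine) and keeps a handle on the underlying SpaCy model, which spacy_fuzzy_search uses as its default nlp argument; that default is evaluated at definition time, which is why the function definition now sits below the create_nlp_analyser call. A usage sketch based on the signatures shown in this diff; the import path follows the file location, and the sample text, query and expected offsets are invented for illustration.

from tools.load_spacy_model_custom_recognisers import (
    create_nlp_analyser,
    spacy_fuzzy_search,
)

# Ask for the underlying SpaCy model as well as the analyser object.
analyser, model = create_nlp_analyser("en", return_also_model=True)

# Fuzzy-search a page of text for a custom term, tolerating one spelling mistake.
starts, ends = spacy_fuzzy_search(
    "The cliant signed the agreement.",
    custom_query_list=["client"],
    spelling_mistakes_max=1,
    search_whole_phrase=True,
    nlp=model,
)
# Character offsets of any fuzzy matches, e.g. [(4, 10)] if "cliant" is matched.
print(list(zip(starts, ends)))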