seanpedrickcase committed
Commit 4c95b3c · Parent: f188b10

Fix for fuzzy matching
tools/file_redaction.py CHANGED
@@ -468,10 +468,12 @@ def choose_and_run_redactor(file_paths:List[str],
     ### Language check - check if selected language packs exist
     try:
         if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION and chosen_local_model == "tesseract":
-            progress(0.1, desc=f"Downloading Tesseract language pack for {language}")
-            download_tesseract_lang_pack(language)
+            if language != "en":
+                progress(0.1, desc=f"Downloading Tesseract language pack for {language}")
+                download_tesseract_lang_pack(language)
 
-        progress(0.1, desc=f"Loading SpaCy model for {language}")
+        if language != "en":
+            progress(0.1, desc=f"Loading SpaCy model for {language}")
         load_spacy_model(language)
 
     except Exception as e:
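Net effect of the hunk above: English is treated as already available, so the Tesseract language pack is downloaded, and the model-loading progress message shown, only for other languages, while load_spacy_model(language) still runs in every case. Below is a minimal, self-contained sketch of that control flow; the two helpers are stand-ins for the real functions named in the diff, and prepare_language_assets is a hypothetical wrapper used only for illustration.

    def download_tesseract_lang_pack(language: str) -> None:  # stand-in for the real helper
        print(f"Downloading Tesseract language pack for {language}")

    def load_spacy_model(language: str) -> None:  # stand-in for the real helper
        print(f"Loading SpaCy model for {language}")

    def prepare_language_assets(language: str) -> None:  # hypothetical wrapper, not in the repo
        if language != "en":
            # Only non-English documents need extra Tesseract data downloaded
            download_tesseract_lang_pack(language)
        # The spaCy model is loaded for every language
        load_spacy_model(language)

    prepare_language_assets("en")  # loads the model only
    prepare_language_assets("fr")  # downloads the language pack, then loads the model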
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -396,113 +396,6 @@ def custom_fuzzy_word_list_regex(text:str, custom_list:List[str]=[]):
 
     return start_positions, end_positions
 
-def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mistakes_max:int = 1, search_whole_phrase:bool=True, nlp=nlp, progress=gr.Progress(track_tqdm=True)):
-    ''' Conduct fuzzy match on a list of text data.'''
-
-    all_matches = []
-    all_start_positions = []
-    all_end_positions = []
-    all_ratios = []
-
-    #print("custom_query_list:", custom_query_list)
-
-    if not text:
-        out_message = "No text data found. Skipping page."
-        print(out_message)
-        return all_start_positions, all_end_positions
-
-    for string_query in custom_query_list:
-
-        #print("text:", text)
-        #print("string_query:", string_query)
-
-        query = nlp(string_query)
-
-        if search_whole_phrase == False:
-            # Keep only words that are not stop words
-            token_query = [token.text for token in query if not token.is_space and not token.is_stop and not token.is_punct]
-
-            spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)
-
-            #print("token_query:", token_query)
-
-            if len(token_query) > 1:
-                #pattern_lemma = [{"LEMMA": {"IN": query}}]
-                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": token_query}}}]
-            else:
-                #pattern_lemma = [{"LEMMA": query[0]}]
-                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: token_query[0]}}]
-
-            matcher = Matcher(nlp.vocab)
-            matcher.add(string_query, [pattern_fuzz])
-            #matcher.add(string_query, [pattern_lemma])
-
-        else:
-            # If matching a whole phrase, use Spacy PhraseMatcher, then consider similarity after using Levenshtein distance.
-            #tokenised_query = [string_query.lower()]
-            # If you want to match the whole phrase, use phrase matcher
-            matcher = FuzzyMatcher(nlp.vocab)
-            patterns = [nlp.make_doc(string_query)] # Convert query into a Doc object
-            matcher.add("PHRASE", patterns, [{"ignore_case": True}])
-
-        batch_size = 256
-        docs = nlp.pipe([text], batch_size=batch_size)
-
-        # Get number of matches per doc
-        for doc in docs: #progress.tqdm(docs, desc = "Searching text", unit = "rows"):
-            matches = matcher(doc)
-            match_count = len(matches)
-
-            # If considering each sub term individually, append match. If considering together, consider weight of the relevance to that of the whole phrase.
-            if search_whole_phrase==False:
-                all_matches.append(match_count)
-
-                for match_id, start, end in matches:
-                    span = str(doc[start:end]).strip()
-                    query_search = str(query).strip()
-                    #print("doc:", doc)
-                    #print("span:", span)
-                    #print("query_search:", query_search)
-
-                    # Convert word positions to character positions
-                    start_char = doc[start].idx # Start character position
-                    end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
-
-                    # The positions here are word position, not character position
-                    all_matches.append(match_count)
-                    all_start_positions.append(start_char)
-                    all_end_positions.append(end_char)
-
-            else:
-                for match_id, start, end, ratio, pattern in matches:
-                    span = str(doc[start:end]).strip()
-                    query_search = str(query).strip()
-                    #print("doc:", doc)
-                    #print("span:", span)
-                    #print("query_search:", query_search)
-
-                    # Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
-                    distance = Levenshtein.distance(query_search.lower(), span.lower())
-
-                    #print("Levenshtein distance:", distance)
-
-                    if distance > spelling_mistakes_max:
-                        match_count = match_count - 1
-                    else:
-                        # Convert word positions to character positions
-                        start_char = doc[start].idx # Start character position
-                        end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
-
-                        #print("start_char:", start_char)
-                        #print("end_char:", end_char)
-
-                        all_matches.append(match_count)
-                        all_start_positions.append(start_char)
-                        all_end_positions.append(end_char)
-                        all_ratios.append(ratio)
-
-
-    return all_start_positions, all_end_positions
 
 class CustomWordFuzzyRecognizer(EntityRecognizer):
     def __init__(self, supported_entities: List[str], custom_list: List[str] = [], spelling_mistakes_max: int = 1, search_whole_phrase: bool = True):
@@ -537,13 +430,11 @@ class CustomWordFuzzyRecognizer(EntityRecognizer):
 custom_list_default = []
 custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
 
-
 # Pass the loaded model to the new LoadedSpacyNlpEngine
 loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp, language_code = ACTIVE_LANGUAGE_CODE)
 
-
 def create_nlp_analyser(language: str = DEFAULT_LANGUAGE, custom_list: List[str] = None,
-                        spelling_mistakes_max: int = 1, search_whole_phrase: bool = True, existing_nlp_analyser: AnalyzerEngine = None):
+                        spelling_mistakes_max: int = 1, search_whole_phrase: bool = True, existing_nlp_analyser: AnalyzerEngine = None, return_also_model: bool = False):
     """
     Create an nlp_analyser object based on the specified language input.
 
@@ -552,6 +443,8 @@ def create_nlp_analyser(language: str = DEFAULT_LANGUAGE, custom_list: List[str]
         custom_list (List[str], optional): List of custom words to recognize. Defaults to None.
         spelling_mistakes_max (int, optional): Maximum number of spelling mistakes for fuzzy matching. Defaults to 1.
        search_whole_phrase (bool, optional): Whether to search for whole phrases or individual words. Defaults to True.
+        existing_nlp_analyser (AnalyzerEngine, optional): Existing nlp_analyser object to use. Defaults to None.
+        return_also_model (bool, optional): Whether to return the nlp_model object as well. Defaults to False.
 
     Returns:
         AnalyzerEngine: Configured nlp_analyser object with custom recognizers
@@ -606,10 +499,117 @@ def create_nlp_analyser(language: str = DEFAULT_LANGUAGE, custom_list: List[str]
     nlp_analyser.registry.add_recognizer(street_recogniser)
     nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
     nlp_analyser.registry.add_recognizer(titles_recogniser)
+
+    if return_also_model:
+        return nlp_analyser, nlp_model
 
     return nlp_analyser
 
 # Create the default nlp_analyser using the new function
-nlp_analyser = create_nlp_analyser(DEFAULT_LANGUAGE)
+nlp_analyser, nlp_model = create_nlp_analyser(DEFAULT_LANGUAGE, return_also_model=True)
+
+def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mistakes_max:int = 1, search_whole_phrase:bool=True, nlp=nlp_model, progress=gr.Progress(track_tqdm=True)):
+    ''' Conduct fuzzy match on a list of text data.'''
+
+    all_matches = []
+    all_start_positions = []
+    all_end_positions = []
+    all_ratios = []
+
+    #print("custom_query_list:", custom_query_list)
+
+    if not text:
+        out_message = "No text data found. Skipping page."
+        print(out_message)
+        return all_start_positions, all_end_positions
+
+    for string_query in custom_query_list:
+
+        query = nlp(string_query)
+
+        if search_whole_phrase == False:
+            # Keep only words that are not stop words
+            token_query = [token.text for token in query if not token.is_space and not token.is_stop and not token.is_punct]
+
+            spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)
+
+            if len(token_query) > 1:
+                #pattern_lemma = [{"LEMMA": {"IN": query}}]
+                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": token_query}}}]
+            else:
+                #pattern_lemma = [{"LEMMA": query[0]}]
+                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: token_query[0]}}]
+
+            matcher = Matcher(nlp.vocab)
+            matcher.add(string_query, [pattern_fuzz])
+            #matcher.add(string_query, [pattern_lemma])
+
+        else:
+            # If matching a whole phrase, use Spacy PhraseMatcher, then consider similarity after using Levenshtein distance.
+            #tokenised_query = [string_query.lower()]
+            # If you want to match the whole phrase, use phrase matcher
+            matcher = FuzzyMatcher(nlp.vocab)
+            patterns = [nlp.make_doc(string_query)] # Convert query into a Doc object
+            matcher.add("PHRASE", patterns, [{"ignore_case": True}])
+
+        batch_size = 256
+        docs = nlp.pipe([text], batch_size=batch_size)
+
+        # Get number of matches per doc
+        for doc in docs: #progress.tqdm(docs, desc = "Searching text", unit = "rows"):
+            matches = matcher(doc)
+            match_count = len(matches)
+
+            # If considering each sub term individually, append match. If considering together, consider weight of the relevance to that of the whole phrase.
+            if search_whole_phrase==False:
+                all_matches.append(match_count)
+
+                for match_id, start, end in matches:
+                    span = str(doc[start:end]).strip()
+                    query_search = str(query).strip()
+                    #print("doc:", doc)
+                    #print("span:", span)
+                    #print("query_search:", query_search)
+
+                    # Convert word positions to character positions
+                    start_char = doc[start].idx # Start character position
+                    end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
+
+                    # The positions here are word position, not character position
+                    all_matches.append(match_count)
+                    all_start_positions.append(start_char)
+                    all_end_positions.append(end_char)
+
+            else:
+                for match_id, start, end, ratio, pattern in matches:
+                    span = str(doc[start:end]).strip()
+                    query_search = str(query).strip()
+                    #print("doc:", doc)
+                    #print("span:", span)
+                    #print("query_search:", query_search)
+
+                    # Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
+                    distance = Levenshtein.distance(query_search.lower(), span.lower())
+
+                    #print("Levenshtein distance:", distance)
+
+                    if distance > spelling_mistakes_max:
+                        match_count = match_count - 1
+                    else:
+                        # Convert word positions to character positions
+                        start_char = doc[start].idx # Start character position
+                        end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
+
+                        #print("start_char:", start_char)
+                        #print("end_char:", end_char)
+
+                        all_matches.append(match_count)
+                        all_start_positions.append(start_char)
+                        all_end_positions.append(end_char)
+                        all_ratios.append(ratio)
+
+
+    return all_start_positions, all_end_positions
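For reference, a minimal usage sketch of the relocated spacy_fuzzy_search together with the new return_also_model flag. It assumes the module is importable as tools.load_spacy_model_custom_recognisers (the path shown in this commit); the sample text and query list are made up.

    from tools.load_spacy_model_custom_recognisers import (
        create_nlp_analyser,
        spacy_fuzzy_search,
    )

    # Build the analyser and get back the spaCy model it was built with,
    # so the fuzzy search runs against the same pipeline the analyser uses.
    nlp_analyser, nlp_model = create_nlp_analyser("en", return_also_model=True)

    # Whole-phrase matching, tolerating one spelling mistake per query.
    start_positions, end_positions = spacy_fuzzy_search(
        text="Contact Mr Smyth at 10 Downing Stret, London.",
        custom_query_list=["Smith", "Downing Street"],
        spelling_mistakes_max=1,
        search_whole_phrase=True,
        nlp=nlp_model,
    )
    print(start_positions, end_positions)

Because the function's default is now nlp=nlp_model rather than the module-level nlp, fuzzy search and the analyser share one model by default, which appears to be the fix the commit message refers to.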