Commit 4c95b3c
Parent(s): f188b10

Fix for fuzzy matching

Files changed:
- tools/file_redaction.py (+5 -3)
- tools/load_spacy_model_custom_recognisers.py (+111 -111)
tools/file_redaction.py
CHANGED
@@ -468,10 +468,12 @@ def choose_and_run_redactor(file_paths:List[str],
     ### Language check - check if selected language packs exist
     try:
         if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION and chosen_local_model == "tesseract":
-
-
+            if language != "en":
+                progress(0.1, desc=f"Downloading Tesseract language pack for {language}")
+                download_tesseract_lang_pack(language)
 
-
+            if language != "en":
+                progress(0.1, desc=f"Loading SpaCy model for {language}")
             load_spacy_model(language)
 
     except Exception as e:
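For context: the added block only fetches extra resources when a non-English language is selected, while the SpaCy model load on the unchanged context line still runs either way. A minimal, self-contained sketch of that guard pattern follows; the two helpers are stubbed purely for illustration, since their real implementations live elsewhere in tools/ and are not shown in this diff.

def download_tesseract_lang_pack(language: str) -> None:
    # Stub for illustration only; the project's helper fetches the Tesseract language data.
    print(f"(stub) would fetch the Tesseract pack for {language}")

def load_spacy_model(language: str) -> None:
    # Stub for illustration only; the project's helper loads/downloads a SpaCy pipeline.
    print(f"(stub) would load a SpaCy model for {language}")

def prepare_language_resources(language: str) -> None:
    # Mirrors the diff: downloads and progress messages are gated on non-English input,
    # but the SpaCy model load itself is not gated (it is a context line above).
    if language != "en":
        download_tesseract_lang_pack(language)

    if language != "en":
        print(f"Loading SpaCy model for {language}")
    load_spacy_model(language)

prepare_language_resources("fr")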
tools/load_spacy_model_custom_recognisers.py
CHANGED
@@ -396,113 +396,6 @@ def custom_fuzzy_word_list_regex(text:str, custom_list:List[str]=[]):
 
     return start_positions, end_positions
 
-def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mistakes_max:int = 1, search_whole_phrase:bool=True, nlp=nlp, progress=gr.Progress(track_tqdm=True)):
-    ''' Conduct fuzzy match on a list of text data.'''
-
-    all_matches = []
-    all_start_positions = []
-    all_end_positions = []
-    all_ratios = []
-
-    #print("custom_query_list:", custom_query_list)
-
-    if not text:
-        out_message = "No text data found. Skipping page."
-        print(out_message)
-        return all_start_positions, all_end_positions
-
-    for string_query in custom_query_list:
-
-        #print("text:", text)
-        #print("string_query:", string_query)
-
-        query = nlp(string_query)
-
-        if search_whole_phrase == False:
-            # Keep only words that are not stop words
-            token_query = [token.text for token in query if not token.is_space and not token.is_stop and not token.is_punct]
-
-            spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)
-
-            #print("token_query:", token_query)
-
-            if len(token_query) > 1:
-                #pattern_lemma = [{"LEMMA": {"IN": query}}]
-                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": token_query}}}]
-            else:
-                #pattern_lemma = [{"LEMMA": query[0]}]
-                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: token_query[0]}}]
-
-            matcher = Matcher(nlp.vocab)
-            matcher.add(string_query, [pattern_fuzz])
-            #matcher.add(string_query, [pattern_lemma])
-
-        else:
-            # If matching a whole phrase, use Spacy PhraseMatcher, then consider similarity after using Levenshtein distance.
-            #tokenised_query = [string_query.lower()]
-            # If you want to match the whole phrase, use phrase matcher
-            matcher = FuzzyMatcher(nlp.vocab)
-            patterns = [nlp.make_doc(string_query)] # Convert query into a Doc object
-            matcher.add("PHRASE", patterns, [{"ignore_case": True}])
-
-        batch_size = 256
-        docs = nlp.pipe([text], batch_size=batch_size)
-
-        # Get number of matches per doc
-        for doc in docs: #progress.tqdm(docs, desc = "Searching text", unit = "rows"):
-            matches = matcher(doc)
-            match_count = len(matches)
-
-            # If considering each sub term individually, append match. If considering together, consider weight of the relevance to that of the whole phrase.
-            if search_whole_phrase==False:
-                all_matches.append(match_count)
-
-                for match_id, start, end in matches:
-                    span = str(doc[start:end]).strip()
-                    query_search = str(query).strip()
-                    #print("doc:", doc)
-                    #print("span:", span)
-                    #print("query_search:", query_search)
-
-                    # Convert word positions to character positions
-                    start_char = doc[start].idx # Start character position
-                    end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
-
-                    # The positions here are word position, not character position
-                    all_matches.append(match_count)
-                    all_start_positions.append(start_char)
-                    all_end_positions.append(end_char)
-
-            else:
-                for match_id, start, end, ratio, pattern in matches:
-                    span = str(doc[start:end]).strip()
-                    query_search = str(query).strip()
-                    #print("doc:", doc)
-                    #print("span:", span)
-                    #print("query_search:", query_search)
-
-                    # Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
-                    distance = Levenshtein.distance(query_search.lower(), span.lower())
-
-                    #print("Levenshtein distance:", distance)
-
-                    if distance > spelling_mistakes_max:
-                        match_count = match_count - 1
-                    else:
-                        # Convert word positions to character positions
-                        start_char = doc[start].idx # Start character position
-                        end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
-
-                        #print("start_char:", start_char)
-                        #print("end_char:", end_char)
-
-                        all_matches.append(match_count)
-                        all_start_positions.append(start_char)
-                        all_end_positions.append(end_char)
-                        all_ratios.append(ratio)
-
-
-    return all_start_positions, all_end_positions
 
 class CustomWordFuzzyRecognizer(EntityRecognizer):
     def __init__(self, supported_entities: List[str], custom_list: List[str] = [], spelling_mistakes_max: int = 1, search_whole_phrase: bool = True):
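The function removed above (and re-added further down in this file) leans on spaCy's fuzzy token matching, where a pattern key of "FUZZY" plus a digit allows up to that many edits per token (available since spaCy 3.5). A standalone illustration of that pattern shape, independent of this repo; the pipeline, terms and sample sentence are invented.

import spacy
from spacy.matcher import Matcher

# A blank pipeline is enough for tokenisation; no trained model is needed here.
nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)

# Same shape as the pattern built in spacy_fuzzy_search:
# {"TEXT": {"FUZZY1": {"IN": token_query}}} allows one edit per matched token.
matcher.add("SURNAMES", [[{"TEXT": {"FUZZY1": {"IN": ["Smith", "Jones"]}}}]])

doc = nlp("Mr Smyth met Ms Jomes yesterday.")
for match_id, start, end in matcher(doc):
    # Expect the misspelled tokens "Smyth" and "Jomes" to match.
    print(doc[start:end].text)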
@@ -537,13 +430,11 @@ class CustomWordFuzzyRecognizer(EntityRecognizer):
 custom_list_default = []
 custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
 
-
 # Pass the loaded model to the new LoadedSpacyNlpEngine
 loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp, language_code = ACTIVE_LANGUAGE_CODE)
 
-
 def create_nlp_analyser(language: str = DEFAULT_LANGUAGE, custom_list: List[str] = None,
-                        spelling_mistakes_max: int = 1, search_whole_phrase: bool = True, existing_nlp_analyser: AnalyzerEngine = None):
+                        spelling_mistakes_max: int = 1, search_whole_phrase: bool = True, existing_nlp_analyser: AnalyzerEngine = None, return_also_model: bool = False):
     """
     Create an nlp_analyser object based on the specified language input.
 
@@ -552,6 +443,8 @@ def create_nlp_analyser(language: str = DEFAULT_LANGUAGE, custom_list: List[str]
         custom_list (List[str], optional): List of custom words to recognize. Defaults to None.
         spelling_mistakes_max (int, optional): Maximum number of spelling mistakes for fuzzy matching. Defaults to 1.
         search_whole_phrase (bool, optional): Whether to search for whole phrases or individual words. Defaults to True.
+        existing_nlp_analyser (AnalyzerEngine, optional): Existing nlp_analyser object to use. Defaults to None.
+        return_also_model (bool, optional): Whether to return the nlp_model object as well. Defaults to False.
 
     Returns:
         AnalyzerEngine: Configured nlp_analyser object with custom recognizers
@@ -606,10 +499,117 @@ def create_nlp_analyser(language: str = DEFAULT_LANGUAGE, custom_list: List[str]
     nlp_analyser.registry.add_recognizer(street_recogniser)
     nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
     nlp_analyser.registry.add_recognizer(titles_recogniser)
+
+    if return_also_model:
+        return nlp_analyser, nlp_model
 
     return nlp_analyser
 
 # Create the default nlp_analyser using the new function
-nlp_analyser = create_nlp_analyser(DEFAULT_LANGUAGE)
+nlp_analyser, nlp_model = create_nlp_analyser(DEFAULT_LANGUAGE, return_also_model=True)
+
+def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mistakes_max:int = 1, search_whole_phrase:bool=True, nlp=nlp_model, progress=gr.Progress(track_tqdm=True)):
+    ''' Conduct fuzzy match on a list of text data.'''
+
+    all_matches = []
+    all_start_positions = []
+    all_end_positions = []
+    all_ratios = []
+
+    #print("custom_query_list:", custom_query_list)
+
+    if not text:
+        out_message = "No text data found. Skipping page."
+        print(out_message)
+        return all_start_positions, all_end_positions
+
+    for string_query in custom_query_list:
+
+        query = nlp(string_query)
+
+        if search_whole_phrase == False:
+            # Keep only words that are not stop words
+            token_query = [token.text for token in query if not token.is_space and not token.is_stop and not token.is_punct]
+
+            spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)
+
+            if len(token_query) > 1:
+                #pattern_lemma = [{"LEMMA": {"IN": query}}]
+                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": token_query}}}]
+            else:
+                #pattern_lemma = [{"LEMMA": query[0]}]
+                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: token_query[0]}}]
+
+            matcher = Matcher(nlp.vocab)
+            matcher.add(string_query, [pattern_fuzz])
+            #matcher.add(string_query, [pattern_lemma])
+
+        else:
+            # If matching a whole phrase, use Spacy PhraseMatcher, then consider similarity after using Levenshtein distance.
+            #tokenised_query = [string_query.lower()]
+            # If you want to match the whole phrase, use phrase matcher
+            matcher = FuzzyMatcher(nlp.vocab)
+            patterns = [nlp.make_doc(string_query)] # Convert query into a Doc object
+            matcher.add("PHRASE", patterns, [{"ignore_case": True}])
+
+        batch_size = 256
+        docs = nlp.pipe([text], batch_size=batch_size)
+
+        # Get number of matches per doc
+        for doc in docs: #progress.tqdm(docs, desc = "Searching text", unit = "rows"):
+            matches = matcher(doc)
+            match_count = len(matches)
+
+            # If considering each sub term individually, append match. If considering together, consider weight of the relevance to that of the whole phrase.
+            if search_whole_phrase==False:
+                all_matches.append(match_count)
+
+                for match_id, start, end in matches:
+                    span = str(doc[start:end]).strip()
+                    query_search = str(query).strip()
+                    #print("doc:", doc)
+                    #print("span:", span)
+                    #print("query_search:", query_search)
+
+                    # Convert word positions to character positions
+                    start_char = doc[start].idx # Start character position
+                    end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
+
+                    # The positions here are word position, not character position
+                    all_matches.append(match_count)
+                    all_start_positions.append(start_char)
+                    all_end_positions.append(end_char)
+
+            else:
+                for match_id, start, end, ratio, pattern in matches:
+                    span = str(doc[start:end]).strip()
+                    query_search = str(query).strip()
+                    #print("doc:", doc)
+                    #print("span:", span)
+                    #print("query_search:", query_search)
+
+                    # Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
+                    distance = Levenshtein.distance(query_search.lower(), span.lower())
+
+                    #print("Levenshtein distance:", distance)
+
+                    if distance > spelling_mistakes_max:
+                        match_count = match_count - 1
+                    else:
+                        # Convert word positions to character positions
+                        start_char = doc[start].idx # Start character position
+                        end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
+
+                        #print("start_char:", start_char)
+                        #print("end_char:", end_char)
+
+                        all_matches.append(match_count)
+                        all_start_positions.append(start_char)
+                        all_end_positions.append(end_char)
+                        all_ratios.append(ratio)
+
+
+    return all_start_positions, all_end_positions
 
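Taken together, the module now builds the default analyser (AnalyzerEngine) and keeps a handle on the underlying SpaCy model, which spacy_fuzzy_search uses as its default nlp argument; that default is evaluated at definition time, which is why the function definition now sits below the create_nlp_analyser call. A usage sketch based on the signatures shown in this diff; the import path follows the file location, and the sample text, query and expected offsets are invented for illustration.

from tools.load_spacy_model_custom_recognisers import (
    create_nlp_analyser,
    spacy_fuzzy_search,
)

# Ask for the underlying SpaCy model as well as the analyser object.
analyser, model = create_nlp_analyser("en", return_also_model=True)

# Fuzzy-search a page of text for a custom term, tolerating one spelling mistake.
starts, ends = spacy_fuzzy_search(
    "The cliant signed the agreement.",
    custom_query_list=["client"],
    spelling_mistakes_max=1,
    search_whole_phrase=True,
    nlp=model,
)
# Character offsets of any fuzzy matches, e.g. [(4, 10)] if "cliant" is matched.
print(list(zip(starts, ends)))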