## Analysis

In [None]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell runs so edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from PIL import Image
from scipy.stats import pearsonr
from utils.get_unique_values import get_unique_values
from utils.remove_duplicates import unzip_fn
from utils.show_tile_images import show_tile_images
import zipfile
import json
from utils.visualize_bboxes_on_image import draw_text_on_image
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import tqdm as tqdm
from functools import cache
from utils.flatten import flatten

In [None]:
# !GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/spaces/mckabue/document-similarity-search-using-visual-layout-features --depth=1

# !wget https://huggingface.co/spaces/mckabue/document-similarity-search-using-visual-layout-features/resolve/main/data/processed/RVL-CDIP-invoice/vectors.json.zip -P ./data/processed/RVL-CDIP-invoice/



# import sys
# sys.path.insert(0, './document-similarity-search-using-visual-layout-features')

In [None]:
# import os
# vectors_chunks = os.listdir('/content/document-similarity-search-using-visual-layout-features/data/processed/RVL-CDIP-invoice/vectors.json.zip.chunks')
# vectors_chunks.sort(key=lambda x: int(x.split('-')[0]))
# vectors_chunks

In [None]:
# Load the per-image vectors table from the zipped JSON file (pandas infers the
# compression from the ".zip" extension). Later cells read its 'vectors',
# 'weighted_vectors', 'reduced_vectors' and 'reduced_weighted_vectors' columns.
vectors_df = pd.read_json('./data/local-data/processed/RVL-CDIP-invoice/vectors.json.zip')
vectors_df

In [None]:
# https://gemini.google.com/app/8cd4389df12d29e6

# https://chat.openai.com/c/a345a9ec-9238-4089-a6c0-bb4d375148eb

### Correlation

In [None]:
# Reference sequence that the cells below correlate against each image's
# 'vectors' and use to fill zero entries.
# NOTE(review): presumably 10,000 values spanning 0.17..1, one per vector
# component — confirm against utils.get_unique_values.
unique_values = get_unique_values(start=0.17, end=1, count=10*1000)

def get_stats(index: int):
    """Compute Pearson correlations between the vector variants of one image.

    Args:
        index: Row label in ``vectors_df``; the row's 'vectors',
            'weighted_vectors', 'reduced_vectors' and
            'reduced_weighted_vectors' cells are read with ``.loc``.

    Returns:
        A dict mapping each pair name to the ``pearsonr`` result for that
        pair. 'non_zero_vectors__uniques' is ``[0, 1]`` when fewer than two
        entries of 'vectors' are positive, because ``pearsonr`` needs at
        least two points.
    """
    vectors = vectors_df.loc[index, 'vectors']
    weighted_vectors = vectors_df.loc[index, 'weighted_vectors']
    reduced_vectors = vectors_df.loc[index, 'reduced_vectors']
    reduced_weighted_vectors = vectors_df.loc[index, 'reduced_weighted_vectors']

    # Keep only the positions where the raw vector is positive, together with
    # the reference value at the same position.
    non_zero_pairs = [
        (vector, unique)
        for vector, unique in zip(vectors, unique_values)
        if vector > 0
    ]
    if non_zero_pairs:
        non_zero_vectors, non_zero_uniques = unzip_fn(non_zero_pairs)
    else:
        non_zero_vectors, non_zero_uniques = [], []

    # pearsonr raises on fewer than two observations, so a single positive
    # entry uses the same neutral fallback as "no positive entries".
    if len(non_zero_vectors) >= 2:
        non_zero_vectors__uniques = pearsonr(non_zero_vectors, non_zero_uniques)
    else:
        non_zero_vectors__uniques = [0, 1]

    return {
        'non_zero_vectors__uniques': non_zero_vectors__uniques,
        'vectors___unique_values': pearsonr(vectors, unique_values),
        'vectors___weighted_vectors': pearsonr(vectors, weighted_vectors),
        'vectors___reduced_vectors': pearsonr(vectors, reduced_vectors),
        'vectors___reduced_weighted_vectors': pearsonr(vectors, reduced_weighted_vectors),
        'weighted_vectors___reduced_vectors': pearsonr(weighted_vectors, reduced_vectors),
        'weighted_vectors___reduced_weighted_vectors': pearsonr(weighted_vectors, reduced_weighted_vectors),
        # Previously this entry repeated the weighted-vs-reduced_weighted pair;
        # it now compares the two vectors named in its key.
        'reduced_vectors___reduced_weighted_vectors': pearsonr(reduced_vectors, reduced_weighted_vectors),
    }

from matplotlib import pyplot as plt
from scipy.signal import convolve
kernel = np.array([0.25, 0.5, 0.25])  # Example kernel for simple averaging


def smooth_vector(vector):
    """Return `vector` convolved with the module-level `kernel`.

    The convolution uses mode='same', so the output has as many entries as
    `vector`; the result is divided by the sum of the kernel weights.
    """
    kernel_weight = sum(kernel)
    convolved = convolve(vector, kernel, mode='same')
    return convolved / kernel_weight

def get_modified_stats(image_1_index: int, image_2_index: int, vector_column: str = 'vectors', plot = False):
    """Compare two images' vectors with several Pearson / cosine variants.

    Four comparisons are made:
      * "old": the stored vectors as they are;
      * plain: zero entries replaced by the value of `unique_values` at the
        same position;
      * "smooth": the zero-filled vectors after `smooth_vector`;
      * "random": the zero-filled vectors with one shared random permutation
        applied to both.

    Args:
        image_1_index: Row label of the first image in ``vectors_df``.
        image_2_index: Row label of the second image in ``vectors_df``.
        vector_column: Column of ``vectors_df`` to read the vectors from.
        plot: When true, plot the first image's stored vector against its
            smoothed, zero-filled version.

    Returns:
        A dict with a 'pearsonr*' and a 'cosine_similarity*' entry per
        comparison. Pearson entries are strings of the form
        "<statistic rounded to 4 places> - <p-value>"; cosine entries are
        rounded to 4 places.
    """
    image_1_values = vectors_df.loc[image_1_index, vector_column]
    image_2_values = vectors_df.loc[image_2_index, vector_column]

    # Work on float copies so the fractional fill values are stored exactly
    # even if the stored vectors happen to be integers.
    image_1_matrix = np.array(image_1_values, dtype=float)
    image_2_matrix = np.array(image_2_values, dtype=float)
    # Boolean-mask indexing needs an array; this is a no-op if it already is one.
    fill_values = np.asarray(unique_values)

    vector_1_zero_indices = image_1_matrix == 0
    vector_2_zero_indices = image_2_matrix == 0

    image_1_matrix[vector_1_zero_indices] = fill_values[vector_1_zero_indices]
    image_2_matrix[vector_2_zero_indices] = fill_values[vector_2_zero_indices]

    _old_pearsonr = pearsonr(image_1_values, image_2_values)
    [[_old_cosine_similarity]] = cosine_similarity([image_1_values], [image_2_values])
    _pearsonr = pearsonr(image_1_matrix, image_2_matrix)
    [[_cosine_similarity]] = cosine_similarity([image_1_matrix], [image_2_matrix])

    # Smooth BOTH vectors before comparing them. Previously the second vector
    # was smoothed but the unsmoothed one was passed to the metrics.
    image_1_matrix_smooth = smooth_vector(image_1_matrix)
    image_2_matrix_smooth = smooth_vector(image_2_matrix)
    _pearsonr_smooth = pearsonr(image_1_matrix_smooth, image_2_matrix_smooth)
    [[_cosine_similarity_smooth]] = cosine_similarity([image_1_matrix_smooth], [image_2_matrix_smooth])

    # The same permutation is applied to both vectors.
    permuted_indices = np.random.permutation(len(image_1_matrix))
    image_1_permuted = image_1_matrix[permuted_indices]
    image_2_permuted = image_2_matrix[permuted_indices]
    _pearsonr_random = pearsonr(image_1_permuted, image_2_permuted)
    [[_cosine_similarity_random]] = cosine_similarity([image_1_permuted], [image_2_permuted])

    if plot:
        plt.figure(figsize=(12, 6))
        plt.plot(image_1_values, label='image_1_values', color = 'red')
        plt.plot(image_1_matrix_smooth, label='image_1_matrix_smooth', color = 'blue')
        plt.show()

    return {
        'old_pearsonr'              : f'{round(_old_pearsonr.statistic, 4)} - {_old_pearsonr.pvalue}',
        'old_cosine_similarity'     : round(_old_cosine_similarity, 4),
        'pearsonr'                  : f'{round(_pearsonr.statistic, 4)} - {_pearsonr.pvalue}',
        'cosine_similarity'         : round(_cosine_similarity, 4),
        'pearsonr_smooth'           : f'{round(_pearsonr_smooth.statistic, 4)} - {_pearsonr_smooth.pvalue}',
        'cosine_similarity_smooth'  : round(_cosine_similarity_smooth, 4),
        'pearsonr_random'           : f'{round(_pearsonr_random.statistic, 4)} - {_pearsonr_random.pvalue}',
        'cosine_similarity_random'  : round(_cosine_similarity_random, 4),
    }


In [None]:
# Spot-check the correlation stats for a single row of vectors_df.
get_stats(19569)

In [None]:
# Collect get_stats() for every row position of vectors_df. Results are added
# one at a time, so an interrupted run still leaves the rows processed so far.
correlation_results = []
correlation_results.extend(
    get_stats(row)
    for row in tqdm.tqdm(range(len(correlation_results), len(vectors_df)))
)

In [None]:
# One histogram per correlation pair, showing the distribution of the
# correlation statistic (element 0 of each result) across all rows.
columns = list(correlation_results[0])
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(12, 12))
axes = axes.flatten()
for i, column in enumerate(columns):
    statistics = [result[column][0] for result in correlation_results]
    axes[i].hist(statistics, bins=100)
    axes[i].set_title(column)

In [None]:
def correlation_fn(index: int):
    """Return the Pearson correlation for every pair of vector columns of one row.

    The row is looked up in ``vectors_df`` by label. Keys have the form
    "<column A> vs <column B>" and follow the column order below; each value
    is the ``pearsonr`` result for that pair.
    """
    column_names = (
        'vectors',
        'weighted_vectors',
        'reduced_vectors',
        'reduced_weighted_vectors',
    )
    row_vectors = {name: vectors_df.loc[index, name] for name in column_names}

    correlations = {}
    for position, left in enumerate(column_names):
        # Pair each column only with the columns that come after it.
        for right in column_names[position + 1:]:
            correlations[f'{left} vs {right}'] = pearsonr(row_vectors[left], row_vectors[right])
    return correlations

# Pairwise column correlations for every row position, with a progress bar.
# NOTE(review): positions are used as .loc labels, which assumes vectors_df has
# the default 0..n-1 index — confirm.
correlation_results_2 = [correlation_fn(i) for i in tqdm.tqdm(range(len(vectors_df)))]

In [None]:
import matplotlib.pyplot as plt

# Line plot of the correlation statistic across all rows, one subplot per
# column pair.
columns = list(correlation_results_2[0].keys())
# Two plots per row; derive the row count from the number of pairs so the grid
# has no unused rows (a fixed 6x2 grid left half the axes empty for six pairs).
n_cols = 2
n_rows = (len(columns) + n_cols - 1) // n_cols
fig, axes = plt.subplots(n_rows, n_cols, figsize=(24, 24))
axes = axes.flatten()
for i, column in enumerate(columns):
    ax = axes[i]
    corr = [j[column][0] for j in correlation_results_2]
    ax.plot(range(0, len(corr)), corr, label='Correlation', color='blue')
    ax.set_title(column)

In [None]:
import matplotlib.pyplot as plt
from pathlib import Path

# Correlation statistic (blue) and p-value (red) across all rows, one subplot
# per column pair; the figure is then saved as a PNG.
columns = list(correlation_results_2[0].keys())
fig, axes = plt.subplots(3, 2, figsize=(24, 24))
axes = axes.flatten()
for i, column in enumerate(columns):
    ax = axes[i]
    corr = [j[column][0] for j in correlation_results_2]
    pvalues = [j[column][1] for j in correlation_results_2]
    ax.plot(range(0, len(corr)), corr, label='correlation', color='blue')
    ax.plot(range(0, len(pvalues)), pvalues, label='p-value', color='red')
    ax.legend(bbox_to_anchor=(1, 0.1), loc='lower right')
    ax.set_ylabel('correlation & p-value')
    ax.set_xlabel(f'images - {column}')
    ax.set_title(column)

# Save into the current user's Downloads folder rather than one hard-coded home
# directory, so the cell runs on any machine.
output_path = Path.home() / 'Downloads' / 'vector-correlations.png'
output_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(output_path)

<hr/>

In [None]:
# vector_columns = ['vectors_column', 'weighted_vectors_column', 'reduced_vectors_column', 'reduced_weighted_vectors_column']
# similarities_json = {}
# for vector_column in tqdm.tqdm(vector_columns):
#     with  zipfile.ZipFile(f'./data/local-data/processed/RVL-CDIP-invoice/cosine_similarity_scores/{vector_column}.json.zip', "r") as zip_ref:
#         similarity_vectors_json = json.loads(zip_ref.read(zip_ref.filelist[0].filename))
#         similarities_json[vector_column] = similarity_vectors_json
@cache
def get_similarities(filter, vector_column: str = 'vectors_column'):
    """Load the cosine-similarity records for a vector column, filtered and sorted.

    Args:
        filter: Predicate called with each record; records for which it
            returns a truthy value are kept. A falsy `filter` keeps everything.
        vector_column: Name of the score archive to read, i.e.
            ``cosine_similarity_scores/<vector_column>.json.zip``.

    Returns:
        The kept records, sorted by 'cosine_similarity_score' from highest to
        lowest.

    NOTE(review): results are memoized on the identity of `filter`, so a
    lambda written inline at the call site never hits the cache and its
    result list stays referenced by the cache — confirm this is intended.
    """
    archive_path = f'./data/local-data/processed/RVL-CDIP-invoice/cosine_similarity_scores/{vector_column}.json.zip'
    with zipfile.ZipFile(archive_path, "r") as zip_ref:
        # Only the first member of the archive is read.
        first_member = zip_ref.filelist[0].filename
        similarity_vectors_json = json.loads(zip_ref.read(first_member))

    results = []
    for value in tqdm.tqdm(similarity_vectors_json):
        if not filter or filter(value):
            results.append(value)

    # Drop the reference to the full record list before returning.
    del similarity_vectors_json

    results.sort(key=lambda similarity: similarity['cosine_similarity_score'], reverse=True)
    return results

In [None]:
# Records that pair an image with itself (same file name on both sides) yet
# score below 1, taken from the reduced weighted vectors scores.
duplicates_matches = get_similarities(
    lambda similarity: similarity['cosine_similarity_score'] < 1 and  similarity['document_image_1'] == similarity['document_image_2'], 
    'reduced_weighted_vectors_column')

# Number of such self-pairs.
len(duplicates_matches)

In [None]:
# Pairs of different images scoring above 0.8 on the reduced weighted vectors;
# get_similarities returns them sorted from highest to lowest score.
top_matches = get_similarities(
    lambda similarity: similarity['cosine_similarity_score'] > 0.8 and  similarity['document_image_1'] != similarity['document_image_2'], 
    'reduced_weighted_vectors_column')

In [None]:
def get_image(filename: str):
    """Open the raw RVL-CDIP invoice image with the given file name via PIL."""
    image_path = f'./data/local-data/raw/RVL-CDIP-invoice/{filename}'
    return Image.open(image_path)

def print_matches(matches, *, per_side = 1, figsize = None, startistics = True):
    """Tile the image pairs of `matches` into one figure via show_tile_images.

    Each match contributes its two document images and, when `startistics`
    (sic) is true, a white panel with the JSON-formatted output of
    get_modified_stats for that pair.

    Args:
        matches: Records with 'document_image_1', 'document_image_2' and
            'cosine_similarity_score' keys. Image file names are expected to
            start with the integer row label followed by a dot.
        per_side: Number of matches placed next to each other on one row.
        figsize: Figure size passed to show_tile_images; derived from the
            number of tiles when falsy.
        startistics: Whether to add the statistics panel and show the
            similarity score in the first title.

    Returns:
        Whatever show_tile_images returns for the assembled tiles.
    """
    images = []
    for match in matches:
        row_images = [
            get_image(match['document_image_1']),
            get_image(match['document_image_2']),
        ]
        if startistics:
            canvas = Image.new("RGB", (800, 1200), 'white')
            pair_stats = get_modified_stats(
                int(match['document_image_1'].split('.')[0]),
                int(match['document_image_2'].split('.')[0]),
                'vectors')
            stats_panel = draw_text_on_image(
                canvas,
                [100, 100],
                json.dumps(pair_stats, indent=4),
                label_text_size=40,
                label_fill_color='white')
            row_images.append(stats_panel)
        images.append(row_images)

    titles = []
    for match in matches:
        if startistics:
            row_titles = [
                f"{match['document_image_1']}, Similarity - {round(match['cosine_similarity_score'], 4)}",
                match['document_image_2'],
                'More Statistics',
            ]
        else:
            row_titles = [match['document_image_1'], match['document_image_2']]
        titles.append(row_titles)

    # Tiles per figure row: the tiles of one match times the matches per row.
    width_parts = len(images[0]) * per_side
    if not figsize:
        figsize = (10.2 * width_parts, 30 * (len(images) / width_parts))

    return show_tile_images(
        images = flatten(images),
        titles = flatten(titles),
        width_parts = width_parts,
        figsize = figsize,
        space = 2,
        pad = True,
        figcolor = '#d3eddd',
        title_color = 'white',
        title_background_color = 'black',
        title_font_size = 25)

# How many of the top matches have a cosine similarity score of 1 or more.
sum(1 for match in top_matches if match['cosine_similarity_score'] >= 1)

In [None]:
# Render the 28 highest-scoring pairs, with their statistics panels.
print_matches(top_matches[0:28])

In [None]:
# Render one selected pair and stamp its cosine similarity on the tiled image
# as a rotated label.
index = 44
print(
    top_matches[index]['document_image_1'],
    top_matches[index]['document_image_2'],
    sep=' - ',
)
draw_text_on_image(
    print_matches([top_matches[index]], figsize=(10, 7)),
    [330, 335],
    f"cosine similarity - {round(top_matches[index]['cosine_similarity_score'], 4)}",
    label_text_size=30,
    label_fill_color='black',
    label_text_color='white',
    label_rotate_angle=90,
    label_text_padding=2,
)

In [None]:
# Print the highest-scoring self-pair record, then render the first ten.
print(duplicates_matches[0])
print_matches(duplicates_matches[:10])

In [None]:
from main import app
import os

# Model weights and config are read from a directory next to this project
# (paths are relative to the parent directory).
model_path = '../detectron2-layout-parser/model_final.pth'
config_path = '../detectron2-layout-parser/config.yaml'

# Every file in ./demo-examples/ is passed to the app as `examples`.
examples = [f'./demo-examples/{filename}' for filename in os.listdir('./demo-examples/')]
app(model_path=model_path, config_path=config_path, examples=examples, debug=True)

In [None]:
import os
from PIL import Image
import layoutparser as lp
from utils.get_features import get_features

# File names of the raw invoice images (os.listdir returns them in arbitrary order).
documents = os.listdir('./data/local-data/raw/RVL-CDIP-invoice')
# model_path = './model/trained_model/model_final.pth'
# config_path = './model/trained_model/config.yaml'
model_path = '../detectron2-layout-parser/model_final.pth'
config_path = '../detectron2-layout-parser/config.yaml'
# Class id -> label name mapping handed to the layout model.
# NOTE(review): presumably the label set the checkpoint was trained with — confirm.
label_map = {0: 'Caption', 1: 'Footnote', 2: 'Formula', 3: 'List-item', 
             4: 'Page-footer', 5: 'Page-header', 6: 'Picture', 
             7: 'Section-header', 8: 'Table', 9: 'Text', 10: 'Title'}
model = lp.Detectron2LayoutModel(
    config_path=config_path,
    model_path=model_path,
    label_map=label_map)

# Run feature extraction on the first listed document only, using a
# 100 x 100 grid (width_parts x height_parts).
for document in documents[0:1]:
    features = get_features(
        image=Image.open(f'./data/local-data/raw/RVL-CDIP-invoice/{document}'),
        model=model,
        label_names=list(label_map.values()),
        width_parts=100,
        height_parts=100)