{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "IsB9l3mBIGUN" }, "source": [ "## Analysis" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", "\n", "import pandas as pd\n", "from PIL import Image\n", "from scipy.stats import pearsonr\n", "from utils.get_unique_values import get_unique_values\n", "from utils.remove_duplicates import unzip_fn\n", "from utils.show_tile_images import show_tile_images\n", "import zipfile\n", "import json\n", "from utils.visualize_bboxes_on_image import draw_text_on_image\n", "import numpy as np\n", "from sklearn.metrics.pairwise import cosine_similarity" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "5l6iv7ZrIGUP" }, "outputs": [], "source": [ "# !GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/spaces/mckabue/document-similarity-search-using-visual-layout-features --depth=1\n", "\n", "# !wget https://huggingface.co/spaces/mckabue/document-similarity-search-using-visual-layout-features/resolve/main/data/processed/RVL-CDIP-invoice/vectors.json.zip -P ./data/processed/RVL-CDIP-invoice/\n", "\n", "\n", "\n", "# import sys\n", "# sys.path.insert(0, './document-similarity-search-using-visual-layout-features')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "172P8Ey8ytD9" }, "outputs": [], "source": [ "# import os\n", "# vectors_chunks = os.listdir('/content/document-similarity-search-using-visual-layout-features/data/processed/RVL-CDIP-invoice/vectors.json.zip.chunks')\n", "# vectors_chunks.sort(key=lambda x: int(x.split('-')[0]))\n", "# vectors_chunks" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ZZD9JBaWa_T_" }, "outputs": [], "source": [ "vectors_df = pd.read_json('./data/local-data/processed/RVL-CDIP-invoice/vectors.json.zip')\n", "vectors_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 
# %%
# https://gemini.google.com/app/8cd4389df12d29e6

# https://chat.openai.com/c/a345a9ec-9238-4089-a6c0-bb4d375148eb

# %% [markdown]
# ### Correlation

# %%
from matplotlib import pyplot as plt
from scipy.signal import convolve

# Reference sequence compared position-by-position against each stored vector.
unique_values = get_unique_values(start=0.17, end=1, count=10*1000)

kernel = np.array([0.25, 0.5, 0.25])  # Example kernel for simple averaging


def get_stats(index: int):
    """Pearson correlations between the vector variants stored for one row.

    Args:
        index: Row label in ``vectors_df``.

    Returns:
        dict mapping ``'<a>___<b>'`` to the ``scipy.stats.pearsonr`` result for
        that pair of columns, plus ``'non_zero_vectors__uniques'`` (strictly
        positive entries of ``vectors`` against the same-position entries of
        ``unique_values``).
    """
    vectors = vectors_df.loc[index, 'vectors']
    weighted_vectors = vectors_df.loc[index, 'weighted_vectors']
    reduced_vectors = vectors_df.loc[index, 'reduced_vectors']
    reduced_weighted_vectors = vectors_df.loc[index, 'reduced_weighted_vectors']

    # Keep only positions where the raw vector is strictly positive, together
    # with the reference value at the same position.
    non_zero_vectors, non_zero_uniques = unzip_fn(
        [(vector, unique) for vector, unique in zip(vectors, unique_values) if vector > 0])

    return {
        'non_zero_vectors__uniques': pearsonr(non_zero_vectors, non_zero_uniques),
        'vectors___unique_values': pearsonr(vectors, unique_values),
        'vectors___weighted_vectors': pearsonr(vectors, weighted_vectors),
        'vectors___reduced_vectors': pearsonr(vectors, reduced_vectors),
        'vectors___reduced_weighted_vectors': pearsonr(vectors, reduced_weighted_vectors),
        'weighted_vectors___reduced_vectors': pearsonr(weighted_vectors, reduced_vectors),
        'weighted_vectors___reduced_weighted_vectors': pearsonr(weighted_vectors, reduced_weighted_vectors),
        # Fixed: this entry previously correlated weighted_vectors (not
        # reduced_vectors) and therefore duplicated the entry above.
        'reduced_vectors___reduced_weighted_vectors': pearsonr(reduced_vectors, reduced_weighted_vectors),
    }


def smooth_vector(vector):
    """Convolve ``vector`` with the module-level ``kernel`` (``mode='same'``),
    normalised by the kernel sum, and return the result."""
    return convolve(vector, kernel, mode='same') / kernel.sum()


def _fill_zeros(values):
    """Float copy of ``values`` whose zero entries are replaced by the
    same-position entries of ``unique_values``."""
    # Float dtype so the fractional fill values are not truncated should the
    # stored vector turn out to be integer-typed.
    filled = np.array(values, dtype=float)
    zero_mask = filled == 0
    filled[zero_mask] = np.asarray(unique_values)[zero_mask]
    return filled


def _format_pearsonr(result) -> str:
    """Render a pearsonr result as '<statistic rounded to 4 places> - <p-value>'."""
    return f'{round(result.statistic, 4)} - {result.pvalue}'


def get_modified_stats(image_1_index: int, image_2_index: int, vector_column: str = 'vectors', plot = False):
    """Compare two rows of ``vectors_df`` under several vector treatments.

    Pearson correlation and cosine similarity are computed for:
      * ``old_*``: the stored vectors as they are;
      * unprefixed: vectors with zero entries replaced by ``unique_values``;
      * ``*_smooth``: the zero-filled vectors after ``smooth_vector``;
      * ``*_random``: the zero-filled vectors after one shared random permutation.

    Args:
        image_1_index: Row label of the first document.
        image_2_index: Row label of the second document.
        vector_column: Column of ``vectors_df`` to read the vectors from.
        plot: When true, plot the first document's stored vector against its
            smoothed, zero-filled version.

    Returns:
        dict of display-ready values: Pearson entries are strings of the form
        ``'<statistic> - <p-value>'``; cosine entries are rounded to 4 places.
    """
    image_1_values = vectors_df.loc[image_1_index, vector_column]
    image_2_values = vectors_df.loc[image_2_index, vector_column]

    image_1_matrix = _fill_zeros(image_1_values)
    image_2_matrix = _fill_zeros(image_2_values)

    _old_pearsonr = pearsonr(image_1_values, image_2_values)
    [[_old_cosine_similarity]] = cosine_similarity([image_1_values], [image_2_values])
    _pearsonr = pearsonr(image_1_matrix, image_2_matrix)
    [[_cosine_similarity]] = cosine_similarity([image_1_matrix], [image_2_matrix])

    # Fixed: both sides are smoothed now. Previously image_2_matrix_smooth was
    # computed but the unsmoothed image_2_matrix was used in the comparison.
    image_1_matrix_smooth = smooth_vector(image_1_matrix)
    image_2_matrix_smooth = smooth_vector(image_2_matrix)
    _pearsonr_smooth = pearsonr(image_1_matrix_smooth, image_2_matrix_smooth)
    [[_cosine_similarity_smooth]] = cosine_similarity([image_1_matrix_smooth], [image_2_matrix_smooth])

    # NOTE(review): the same permutation is applied to both vectors, which
    # leaves Pearson and cosine mathematically unchanged. If a shuffled
    # baseline was intended, permute only one of the two - confirm intent.
    permuted_indices = np.random.permutation(len(image_1_matrix))
    _pearsonr_random = pearsonr(image_1_matrix[permuted_indices], image_2_matrix[permuted_indices])
    [[_cosine_similarity_random]] = cosine_similarity(
        [image_1_matrix[permuted_indices]], [image_2_matrix[permuted_indices]])

    if plot:
        plt.figure(figsize=(12, 6))
        plt.plot(image_1_values, label='image_1_values', color = 'red')
        plt.plot(image_1_matrix_smooth, label='image_1_matrix_smooth', color = 'blue')
        plt.legend()  # the labels above are otherwise never rendered
        plt.show()

    return {
        'old_pearsonr' : _format_pearsonr(_old_pearsonr),
        'old_cosine_similarity' : round(_old_cosine_similarity, 4),
        'pearsonr' : _format_pearsonr(_pearsonr),
        'cosine_similarity' : round(_cosine_similarity, 4),
        'pearsonr_smooth' : _format_pearsonr(_pearsonr_smooth),
        'cosine_similarity_smooth' : round(_cosine_similarity_smooth, 4),
        'pearsonr_random' : _format_pearsonr(_pearsonr_random),
        'cosine_similarity_random' : round(_cosine_similarity_random, 4),
    }

# %%
get_stats(0)

# %%
# Parse the first member of the zip archive as JSON.
with zipfile.ZipFile('./data/local-data/processed/RVL-CDIP-invoice/cosine_similarity_scores/vectors_column.json.zip', "r") as zip_ref:
    similarity_vectors_json = json.loads(zip_ref.read(zip_ref.filelist[0].filename))

# %%
# Pairs of different files scoring above 0.8, best first.
top_matches = [
    similarity for similarity in
    similarity_vectors_json
    if similarity['cosine_similarity_score'] > 0.8 and
    similarity['document_image_1'] != similarity['document_image_2']]
top_matches.sort(key=lambda similarity: similarity['cosine_similarity_score'], reverse=True)
# %%
def get_image(filename: str):
    """Open ``filename`` from the raw RVL-CDIP-invoice folder as a PIL image."""
    return Image.open(f'./data/local-data/raw/RVL-CDIP-invoice/{filename}')


def _document_index(filename: str) -> int:
    """Integer before the first '.' of a document file name.

    The result is used as a ``vectors_df`` row label - presumably files are
    named '<row label>.<ext>'; verify against the dataset.
    """
    return int(filename.split('.')[0])


def _stats_panel(match):
    """White 800x1200 image carrying the JSON-formatted ``get_modified_stats``
    output for the two documents of ``match``."""
    stats = get_modified_stats(
        _document_index(match['document_image_1']),
        _document_index(match['document_image_2']),
        'vectors')
    return draw_text_on_image(
        Image.new("RGB", (800, 1200), 'white'),
        [100, 100],
        json.dumps(stats, indent=4),
        label_text_size=40,
        label_rectangle_color='white',
    )


def print_matches(matches, two_column_count, *, start = 0):
    """Show matched document pairs, one row per match.

    Each row has three tiles: the first document, the second document, and a
    panel of extra statistics for the pair.

    Args:
        matches: Sequence of dicts with ``document_image_1``,
            ``document_image_2`` and ``cosine_similarity_score`` keys.
        two_column_count: Number of matches (rows) to show. Fewer rows are
            shown when ``matches`` runs out first.
        start: Position in ``matches`` of the first row (keyword-only).

    Returns:
        Whatever ``show_tile_images`` returns.

    Raises:
        IndexError: if there is no match at or after ``start``.
    """
    # Clamp to the available matches instead of failing midway with a bare
    # "list index out of range".
    stop = min(start + two_column_count, len(matches))
    selected = [matches[i] for i in range(start, stop)]
    if not selected:
        raise IndexError(
            f'no matches to display from position {start} (only {len(matches)} available)')

    # Build the flat tile and title lists directly. Routing PIL images through
    # np.array(..., dtype="object").flatten() relies on NumPy treating them as
    # ragged array-likes and is unnecessary for a simple flatten.
    images = []
    titles = []
    for match in selected:
        images.extend([
            get_image(match['document_image_1']),
            get_image(match['document_image_2']),
            _stats_panel(match),
        ])
        titles.extend([
            f"{match['document_image_1']}, Similarity - {round(match['cosine_similarity_score'], 4)}",
            match['document_image_2'],
            'More Statistics',
        ])

    width_parts = 3  # tiles per row
    return show_tile_images(
        images,
        titles = titles,
        width_parts = width_parts,
        figsize = (10.2 * width_parts, 12 * (len(images) / width_parts)),
        space = 2,
        pad = True,
        figcolor = '#d3eddd',
        title_color = 'black',
        title_background_color = 'white',
        title_font_size = 30)

print_matches(top_matches, 2, start=0)

# %%
# Pairs scoring strictly between 0.9 and 1.0, best first.
almost_similar = [similarity for similarity in
    similarity_vectors_json
    if similarity['cosine_similarity_score'] > 0.9 and similarity['cosine_similarity_score'] < 1.0]
almost_similar.sort(key=lambda similarity: similarity['cosine_similarity_score'], reverse=True)
# %%
print_matches(almost_similar, 5, start=0)

# %%
from pathlib import Path

from main import app

# Directory holding the layout-parser model, config and example images,
# relative to the notebook's working directory.
layout_parser_dir = Path('../detectron2-layout-parser')

model_path = '../detectron2-layout-parser/model_final.pth'
config_path = '../detectron2-layout-parser/config.yaml'

# Example images are resolved from the same relative directory as the model
# files rather than a user-specific absolute home path, so the cell runs on
# any checkout. The three examples are repeated 5 times, as before.
examples = [
    str((layout_parser_dir / f'example.{number}.jpg').resolve())
    for number in (1, 2, 3)
] * 5
app(model_path=model_path, config_path=config_path, examples=examples, debug=True)
# %%
import gradio as gr


def classify_image(image):
    """Placeholder classifier: ignores ``image`` and returns fixed label scores."""
    # Swap this stub for real classification logic
    # (e.g. a pre-trained model or a custom implementation).
    return {"cat": 0.8, "dog": 0.2}


# PIL-typed image input; label output showing the top 3 predictions.
image_input = gr.components.Image(type="pil")
label = gr.components.Label(num_top_classes=3)

interface = gr.Interface(fn=classify_image, inputs=image_input, outputs=label)
interface.launch()
# %%
# Minimal Blocks layout: a read-only document image that is populated by
# clicking one of the example images defined in the earlier launch cell.
with gr.Blocks() as interface:
    document = gr.Image(
        type="pil",
        label="Document",
        interactive=False,
        show_download_button=True,
    )
    gr.Examples(
        examples=examples,
        inputs=document,
        label='Select any of these test document images',
    )

interface.launch(debug=True)