Spaces:

mariagrandury
/

language-gap-in-hf-hub

Sleeping

App Files Files Community

mariagrandury committed on May 21

Commit

a938b8a

1 Parent(s): 2a275ae

specify resource type in plot names

Browse files

Files changed (12) hide show

.DS_Store +0 -0
app.py +37 -15
hub_datasets_by_language.py +8 -8
hub_models_by_language.py +131 -0
plots/{bar_plot_horizontal.png → datasets_bar_plot_horizontal.png} +0 -0
plots/{bar_plot_vertical.png → datasets_bar_plot_vertical.png} +0 -0
plots/{pie_chart.png → datasets_pie_chart.png} +0 -0
plots/{stack_area.png → datasets_stack_area.png} +0 -0
plots/{stack_area_en_es.png → datasets_stack_area_en_es.png} +0 -0
plots/{stack_area_es.png → datasets_stack_area_es.png} +0 -0
plots/{stack_area_es_ca_gl_eu.png → datasets_stack_area_es_ca_gl_eu.png} +0 -0
plots/{time_series.png → datasets_time_series.png} +0 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

app.py CHANGED Viewed

@@ -3,17 +3,24 @@ import subprocess
 import gradio as gr
-def run_script():
     try:
-        result = subprocess.run(
             ["python", "hub_datasets_by_language.py"],
             capture_output=True,
             text=True,
             check=True,
         )
-        return "Script executed successfully! Plots have been updated."
     except subprocess.CalledProcessError as e:
-        return f"Failed to execute script: {str(e.stderr)}"
 def create_app():
@@ -30,42 +37,57 @@ def create_app():
             """
             ## English vs Spanish Monolingual Datasets
-            Note: We consider only **monolingual** datasets in these plots, i.e. datasets that only contain data in one language. This is because *most* of the multilingual datasets are usually machine-translated and we want to focus on original data.
         """
         )
         with gr.Row():
             with gr.Column():
                 gr.Image(
-                    value="plots/bar_plot_horizontal.png",
-                    label="Distribution by Year (Horizontal)",
                     show_label=True,
                     show_download_button=True,
                     show_share_button=True,
                 )
                 gr.Image(
-                    value="plots/stack_area_en_es.png",
-                    label="Stacked Area Plot",
                     show_label=True,
                     show_download_button=True,
                     show_share_button=True,
                 )
             with gr.Column():
                 gr.Image(
-                    value="plots/bar_plot_vertical.png",
-                    label="Distribution by Year (Vertical)",
                     show_label=True,
                     show_download_button=True,
                     show_share_button=True,
                 )
                 gr.Image(
-                    value="plots/time_series.png",
-                    label="Cumulative Growth Over Time",
                     show_label=True,
                     show_download_button=True,
                     show_share_button=True,
                 )
         with gr.Row():
             update_button = gr.Button("Update Plots with Latest Data")
             output_label = gr.Label()
@@ -74,7 +96,7 @@ def create_app():
             """
             ## Adapt to other languages
-            This Space is WIP and more languages and visuals will be included shortly. Meanwhile, you can clone the Space, adapt the code in the script and run it to generate plots for other languages.
             """
         )
@@ -97,7 +119,7 @@ def create_app():
             )
         update_button.click(
-            fn=run_script,
             outputs=output_label,
         )

 import gradio as gr
def run_scripts():
    """Regenerate the plots by running the plotting scripts as subprocesses.

    Each script is started with the interpreter that is running this app
    (``sys.executable``) instead of whatever ``python`` happens to be first
    on PATH, so the script sees the same installed packages as the app.

    Returns:
        str: A status message for the UI. A success message when every
        script exits with status 0; otherwise a failure message containing
        the failing script's captured stderr (or the OS error when the
        interpreter could not be started at all).
    """
    import sys  # local import: only needed to locate the running interpreter

    scripts = [
        "hub_datasets_by_language.py",
        # "hub_models_by_language.py",  # still disabled, as in the original
    ]
    try:
        for script in scripts:
            subprocess.run(
                [sys.executable, script],
                capture_output=True,
                text=True,
                check=True,  # non-zero exit status raises CalledProcessError
            )
    except subprocess.CalledProcessError as e:
        return f"Failed to execute scripts: {e.stderr}"
    except OSError as e:
        # The interpreter itself could not be launched; report it to the UI
        # instead of letting the callback crash.
        return f"Failed to execute scripts: {e}"
    return "Scripts executed successfully! All plots have been updated."
 def create_app():
             """
             ## English vs Spanish Monolingual Datasets
+            Note: We consider only **monolingual** resources in these plots, i.e. datasets and models that only contain data in one language. This is because *most* of the multilingual resources are usually machine-translated and we want to focus on original data.
         """
         )
         with gr.Row():
             with gr.Column():
                 gr.Image(
+                    value="plots/datasets_bar_plot_horizontal.png",
+                    label="Distribution of Datasets by Year (Horizontal)",
                     show_label=True,
                     show_download_button=True,
                     show_share_button=True,
                 )
                 gr.Image(
+                    value="plots/datasets_stack_area_en_es.png",
+                    label="Cumulative Growth of Datasets (Stacked)",
                     show_label=True,
                     show_download_button=True,
                     show_share_button=True,
                 )
             with gr.Column():
                 gr.Image(
+                    value="plots/datasets_bar_plot_vertical.png",
+                    label="Distribution of Datasets by Year (Vertical)",
                     show_label=True,
                     show_download_button=True,
                     show_share_button=True,
                 )
                 gr.Image(
+                    value="plots/datasets_time_series.png",
+                    label="Cumulative Growth of Datasets (Line)",
                     show_label=True,
                     show_download_button=True,
                     show_share_button=True,
                 )
+        # gr.Markdown(
+        #     """
+        #     ## English vs Spanish Models
+        #     """
+        # )
+        # with gr.Row():
+        #     gr.Image(
+        #         value="plots/models_stack_area_en_es.png",
+        #         label="Cumulative Growth of Models",
+        #         show_label=True,
+        #         show_download_button=True,
+        #         show_share_button=True,
+        #     )
         with gr.Row():
             update_button = gr.Button("Update Plots with Latest Data")
             output_label = gr.Label()
             """
             ## Adapt to other languages
+            This Space is WIP and more languages and visuals will be included shortly. Meanwhile, you can clone the Space, adapt the code in the scripts and run it to generate plots for other languages.
             """
         )
             )
         update_button.click(
+            fn=run_scripts,
             outputs=output_label,
         )

hub_datasets_by_language.py CHANGED Viewed

@@ -133,7 +133,7 @@ def create_bar_plots(datasets, output_dir):
     plt.legend()
     plt.grid(GRID)
     plt.tight_layout()
-    plt.savefig(f"{output_dir}/bar_plot_horizontal.png")
     plt.close()
     # Vertical bar plot
@@ -160,7 +160,7 @@ def create_bar_plots(datasets, output_dir):
     plt.legend()
     plt.tight_layout()
     plt.grid(GRID)
-    plt.savefig(f"{output_dir}/bar_plot_vertical.png")
     plt.close()
@@ -184,7 +184,7 @@ def create_pie_chart(datasets, output_dir):
         ],
     )
     plt.axis("equal")
-    plt.savefig(f"{output_dir}/pie_chart.png")
     plt.close()
@@ -231,7 +231,7 @@ def create_time_series(datasets, output_dir):
     plt.legend(loc="upper left")
     plt.tight_layout()
     plt.grid(GRID)
-    plt.savefig(f"{output_dir}/time_series.png")
     plt.close()
@@ -275,7 +275,7 @@ def create_stack_area_plots(datasets, output_dir):
     plt.legend(loc="upper left")
     plt.tight_layout()
     plt.grid(GRID)
-    plt.savefig(f"{output_dir}/stack_area.png")
     plt.close()
     # Plot stacked area for all except English
@@ -299,7 +299,7 @@ def create_stack_area_plots(datasets, output_dir):
     plt.legend(loc="upper left")
     plt.tight_layout()
     plt.grid(GRID)
-    plt.savefig(f"{output_dir}/stack_area_es_ca_gl_eu.png")
     plt.close()
     # Plot stacked area for English and Spanish
@@ -317,7 +317,7 @@ def create_stack_area_plots(datasets, output_dir):
     plt.legend(loc="upper left")
     plt.tight_layout()
     plt.grid(GRID)
-    plt.savefig(f"{output_dir}/stack_area_en_es.png")
     plt.close()
     # Plot stacked area for Spanish only
@@ -335,7 +335,7 @@ def create_stack_area_plots(datasets, output_dir):
     plt.legend(loc="upper left")
     plt.tight_layout()
     plt.grid(GRID)
-    plt.savefig(f"{output_dir}/stack_area_es.png")
     plt.close()

     plt.legend()
     plt.grid(GRID)
     plt.tight_layout()
+    plt.savefig(f"{output_dir}/datasets_bar_plot_horizontal.png")
     plt.close()
     # Vertical bar plot
     plt.legend()
     plt.tight_layout()
     plt.grid(GRID)
+    plt.savefig(f"{output_dir}/datasets_bar_plot_vertical.png")
     plt.close()
         ],
     )
     plt.axis("equal")
+    plt.savefig(f"{output_dir}/datasets_pie_chart.png")
     plt.close()
     plt.legend(loc="upper left")
     plt.tight_layout()
     plt.grid(GRID)
+    plt.savefig(f"{output_dir}/datasets_time_series.png")
     plt.close()
     plt.legend(loc="upper left")
     plt.tight_layout()
     plt.grid(GRID)
+    plt.savefig(f"{output_dir}/datasets_stack_area.png")
     plt.close()
     # Plot stacked area for all except English
     plt.legend(loc="upper left")
     plt.tight_layout()
     plt.grid(GRID)
+    plt.savefig(f"{output_dir}/datasets_stack_area_es_ca_gl_eu.png")
     plt.close()
     # Plot stacked area for English and Spanish
     plt.legend(loc="upper left")
     plt.tight_layout()
     plt.grid(GRID)
+    plt.savefig(f"{output_dir}/datasets_stack_area_en_es.png")
     plt.close()
     # Plot stacked area for Spanish only
     plt.legend(loc="upper left")
     plt.tight_layout()
     plt.grid(GRID)
+    plt.savefig(f"{output_dir}/datasets_stack_area_es.png")
     plt.close()

hub_models_by_language.py ADDED Viewed

	@@ -0,0 +1,131 @@

+import os
+import pickle
+from datetime import datetime
+import matplotlib.pyplot as plt
+import pandas as pd
+from huggingface_hub import HfApi
+# Define colors for each language
+LANGUAGE_COLORS = {
+    "english": "orange",
+    "spanish": "blue",
+}
def _is_monolingual(tags, language_code):
    """Return True when *tags* declare exactly one language: *language_code*.

    A model qualifies when it carries the ``language:<code>`` tag and no
    other tag starting with ``language:``. ``tags`` may be ``None`` or
    empty, in which case the model declares no language and does not match.
    """
    language_tags = {tag for tag in (tags or ()) if tag.startswith("language:")}
    return language_tags == {f"language:{language_code}"}


def fetch_models(cache_file="models_cache.pkl"):
    """Fetch English-only and Spanish-only models from the Hub, with caching.

    Args:
        cache_file: Path of the pickle file used as a cache. A cache whose
            modification time is less than 24 hours old is returned as-is.

    Returns:
        dict: ``{"english": [...], "spanish": [...]}`` where each list holds
        the model-info objects yielded by ``HfApi.list_models(full=True)``
        whose only ``language:`` tag is ``language:en`` / ``language:es``.
    """
    # Check if cached data exists and is less than 24 hours old
    if os.path.exists(cache_file):
        cache_age = datetime.now().timestamp() - os.path.getmtime(cache_file)
        if cache_age < 24 * 3600:  # 24 hours in seconds
            print("Loading models from cache...")
            try:
                # NOTE(review): pickle.load can execute arbitrary code. This
                # is only acceptable if the cache file is always one written
                # by this script - confirm it cannot come from elsewhere.
                with open(cache_file, "rb") as f:
                    return pickle.load(f)
            except (pickle.UnpicklingError, EOFError) as err:
                # A truncated or corrupt cache should not abort the run;
                # fall through and rebuild it from the Hub.
                print(f"Cache is unreadable ({err}), fetching fresh data...")
        else:
            print("Cache is older than 24 hours, fetching fresh data...")
    else:
        print("No cache found, fetching models from Hugging Face Hub...")

    hf_api = HfApi()
    language_codes = {"english": "en", "spanish": "es"}
    filtered_models = {name: [] for name in language_codes}

    # Stream the listing once and keep only the matches, rather than
    # materialising every model and scanning the list once per language.
    for model in hf_api.list_models(full=True):
        for name, code in language_codes.items():
            if _is_monolingual(model.tags, code):
                filtered_models[name].append(model)
                break  # a monolingual model matches at most one language

    # Cache the filtered models
    print("Saving models to cache...")
    with open(cache_file, "wb") as f:
        pickle.dump(filtered_models, f)
    return filtered_models
def _cumulative_monthly_counts(dates, month_starts):
    """Return the running total of *dates* per month, aligned to *month_starts*.

    Args:
        dates: Iterable of ``datetime.date`` objects (may be empty).
        month_starts: ``DatetimeIndex`` of month-start timestamps to align to.

    Returns:
        pandas.Series indexed by *month_starts*; months with no dates count
        as 0 before the cumulative sum is taken.
    """
    stamps = pd.to_datetime(pd.Series(dates, dtype="object"))
    # Snap every date to the first day of its month so it lines up with the
    # month-start index used for the x-axis.
    months = stamps.dt.to_period("M").dt.to_timestamp()
    monthly = months.value_counts().reindex(month_starts, fill_value=0)
    return monthly.cumsum()


def create_stack_area_plot(models, output_dir):
    """Create a stacked area plot of cumulative English and Spanish models.

    Args:
        models: Mapping with the keys ``"english"`` and ``"spanish"``, each a
            list of objects exposing a ``created_at`` datetime.
        output_dir: Directory the figure is written to, as
            ``models_stack_area_en_es.png``.

    Returns:
        None. When no model has a creation date, a message is printed and no
        file is written.
    """
    languages = ["english", "spanish"]

    # Collect creation dates once per language. Models without a creation
    # date cannot be placed on the time axis, so they are left out.
    dates_by_language = {
        lang: [m.created_at.date() for m in models[lang] if m.created_at is not None]
        for lang in languages
    }
    all_dates = [d for dates in dates_by_language.values() for d in dates]
    if not all_dates:
        print("No models found for any language. Skipping plot creation.")
        return

    # Common month-start axis for all languages. The start is anchored to
    # the first day of the earliest month: with freq="MS", date_range begins
    # at the first month start ON OR AFTER `start`, so a mid-month start
    # would drop the earliest month (and its models) from every total.
    first_month = min(all_dates).replace(day=1)
    date_range = pd.date_range(start=first_month, end=max(all_dates), freq="MS")

    cumulative = {
        lang: _cumulative_monthly_counts(dates_by_language[lang], date_range)
        for lang in languages
    }

    # Plot stacked area for English and Spanish
    plt.figure(figsize=(10, 6))
    plt.stackplot(
        date_range,
        [cumulative[lang].values for lang in languages],
        labels=["English", "Spanish"],
        colors=[LANGUAGE_COLORS[lang] for lang in languages],
    )
    plt.xlabel("Date", fontsize=10)
    plt.ylabel("Cumulative Number of Models", fontsize=10)
    plt.xticks(rotation=45, fontsize=10)
    plt.legend(loc="upper left")
    plt.tight_layout()
    plt.savefig(f"{output_dir}/models_stack_area_en_es.png")
    plt.close()
def main():
    """Fetch the models, print per-language counts and write the plot.

    Ensures the ``plots`` directory exists, loads the per-language model
    lists via ``fetch_models``, reports how many models each language has,
    then hands the lists to ``create_stack_area_plot``.
    """
    plots_dir = "plots"
    # exist_ok=True: an already existing directory is fine
    os.makedirs(plots_dir, exist_ok=True)

    print("Fetching models from Hugging Face Hub...")
    models_by_language = fetch_models()

    print("\nModel counts:")
    for language in models_by_language:
        count = len(models_by_language[language])
        print(f"{language.capitalize()}: {count}")

    print("\nCreating stack area plot...")
    create_stack_area_plot(models_by_language, plots_dir)
    print(f"Plot has been saved to the '{plots_dir}' directory")


if __name__ == "__main__":
    main()

plots/{bar_plot_horizontal.png → datasets_bar_plot_horizontal.png} RENAMED Viewed

File without changes

plots/{bar_plot_vertical.png → datasets_bar_plot_vertical.png} RENAMED Viewed

File without changes

plots/{pie_chart.png → datasets_pie_chart.png} RENAMED Viewed

File without changes

plots/{stack_area.png → datasets_stack_area.png} RENAMED Viewed

File without changes

plots/{stack_area_en_es.png → datasets_stack_area_en_es.png} RENAMED Viewed

File without changes

plots/{stack_area_es.png → datasets_stack_area_es.png} RENAMED Viewed

File without changes

plots/{stack_area_es_ca_gl_eu.png → datasets_stack_area_es_ca_gl_eu.png} RENAMED Viewed

File without changes

plots/{time_series.png → datasets_time_series.png} RENAMED Viewed

File without changes