Commit
·
a938b8a
1
Parent(s):
2a275ae
specify resource type in plot names
Browse files- .DS_Store +0 -0
- app.py +37 -15
- hub_datasets_by_language.py +8 -8
- hub_models_by_language.py +131 -0
- plots/{bar_plot_horizontal.png → datasets_bar_plot_horizontal.png} +0 -0
- plots/{bar_plot_vertical.png → datasets_bar_plot_vertical.png} +0 -0
- plots/{pie_chart.png → datasets_pie_chart.png} +0 -0
- plots/{stack_area.png → datasets_stack_area.png} +0 -0
- plots/{stack_area_en_es.png → datasets_stack_area_en_es.png} +0 -0
- plots/{stack_area_es.png → datasets_stack_area_es.png} +0 -0
- plots/{stack_area_es_ca_gl_eu.png → datasets_stack_area_es_ca_gl_eu.png} +0 -0
- plots/{time_series.png → datasets_time_series.png} +0 -0
.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
app.py
CHANGED
@@ -3,17 +3,24 @@ import subprocess
|
|
3 |
import gradio as gr
|
4 |
|
5 |
|
6 |
-
def
|
7 |
try:
|
8 |
-
|
|
|
9 |
["python", "hub_datasets_by_language.py"],
|
10 |
capture_output=True,
|
11 |
text=True,
|
12 |
check=True,
|
13 |
)
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
except subprocess.CalledProcessError as e:
|
16 |
-
return f"Failed to execute
|
17 |
|
18 |
|
19 |
def create_app():
|
@@ -30,42 +37,57 @@ def create_app():
|
|
30 |
"""
|
31 |
## English vs Spanish Monolingual Datasets
|
32 |
|
33 |
-
Note: We consider only **monolingual**
|
34 |
"""
|
35 |
)
|
36 |
|
37 |
with gr.Row():
|
38 |
with gr.Column():
|
39 |
gr.Image(
|
40 |
-
value="plots/
|
41 |
-
label="Distribution by Year (Horizontal)",
|
42 |
show_label=True,
|
43 |
show_download_button=True,
|
44 |
show_share_button=True,
|
45 |
)
|
46 |
gr.Image(
|
47 |
-
value="plots/
|
48 |
-
label="
|
49 |
show_label=True,
|
50 |
show_download_button=True,
|
51 |
show_share_button=True,
|
52 |
)
|
53 |
with gr.Column():
|
54 |
gr.Image(
|
55 |
-
value="plots/
|
56 |
-
label="Distribution by Year (Vertical)",
|
57 |
show_label=True,
|
58 |
show_download_button=True,
|
59 |
show_share_button=True,
|
60 |
)
|
61 |
gr.Image(
|
62 |
-
value="plots/
|
63 |
-
label="Cumulative Growth
|
64 |
show_label=True,
|
65 |
show_download_button=True,
|
66 |
show_share_button=True,
|
67 |
)
|
68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
with gr.Row():
|
70 |
update_button = gr.Button("Update Plots with Latest Data")
|
71 |
output_label = gr.Label()
|
@@ -74,7 +96,7 @@ def create_app():
|
|
74 |
"""
|
75 |
## Adapt to other languages
|
76 |
|
77 |
-
This Space is WIP and more languages and visuals will be included shortly. Meanwhile, you can clone the Space, adapt the code in the
|
78 |
"""
|
79 |
)
|
80 |
|
@@ -97,7 +119,7 @@ def create_app():
|
|
97 |
)
|
98 |
|
99 |
update_button.click(
|
100 |
-
fn=
|
101 |
outputs=output_label,
|
102 |
)
|
103 |
|
|
|
3 |
import gradio as gr
|
4 |
|
5 |
|
6 |
+
def run_scripts():
    """Regenerate the plots by running the plotting scripts as subprocesses.

    Each script is run with the interpreter that is running this app, with
    its output captured. Execution stops at the first script that fails.

    Returns:
        str: A status message for the UI. The success message is returned
        when every script exits with status 0; otherwise a failure message
        carrying the failing script's captured stderr (or the OS error text
        when the process could not be started at all).
    """
    # Function-scope import keeps this block self-contained.
    import sys

    scripts = [
        "hub_datasets_by_language.py",
        # "hub_models_by_language.py",  # left disabled, as in the original
    ]
    try:
        for script in scripts:
            subprocess.run(
                # sys.executable guarantees the scripts run under the same
                # interpreter (and installed packages) as the app itself,
                # rather than whichever "python" happens to be first on PATH.
                [sys.executable, script],
                capture_output=True,
                text=True,
                check=True,
            )
    except subprocess.CalledProcessError as e:
        return f"Failed to execute scripts: {e.stderr}"
    except OSError as e:
        # The interpreter itself could not be launched; report it to the UI
        # instead of letting the exception escape the click handler.
        return f"Failed to execute scripts: {e}"
    return "Scripts executed successfully! All plots have been updated."
|
24 |
|
25 |
|
26 |
def create_app():
|
|
|
37 |
"""
|
38 |
## English vs Spanish Monolingual Datasets
|
39 |
|
40 |
+
Note: We consider only **monolingual** resources in these plots, i.e. datasets and models that only contain data in one language. This is because *most* of the multilingual resources are usually machine-translated and we want to focus on original data.
|
41 |
"""
|
42 |
)
|
43 |
|
44 |
with gr.Row():
|
45 |
with gr.Column():
|
46 |
gr.Image(
|
47 |
+
value="plots/datasets_bar_plot_horizontal.png",
|
48 |
+
label="Distribution of Datasets by Year (Horizontal)",
|
49 |
show_label=True,
|
50 |
show_download_button=True,
|
51 |
show_share_button=True,
|
52 |
)
|
53 |
gr.Image(
|
54 |
+
value="plots/datasets_stack_area_en_es.png",
|
55 |
+
label="Cumulative Growth of Datasets (Stacked)",
|
56 |
show_label=True,
|
57 |
show_download_button=True,
|
58 |
show_share_button=True,
|
59 |
)
|
60 |
with gr.Column():
|
61 |
gr.Image(
|
62 |
+
value="plots/datasets_bar_plot_vertical.png",
|
63 |
+
label="Distribution of Datasets by Year (Vertical)",
|
64 |
show_label=True,
|
65 |
show_download_button=True,
|
66 |
show_share_button=True,
|
67 |
)
|
68 |
gr.Image(
|
69 |
+
value="plots/datasets_time_series.png",
|
70 |
+
label="Cumulative Growth of Datasets (Line)",
|
71 |
show_label=True,
|
72 |
show_download_button=True,
|
73 |
show_share_button=True,
|
74 |
)
|
75 |
|
76 |
+
# gr.Markdown(
|
77 |
+
# """
|
78 |
+
# ## English vs Spanish Models
|
79 |
+
# """
|
80 |
+
# )
|
81 |
+
|
82 |
+
# with gr.Row():
|
83 |
+
# gr.Image(
|
84 |
+
# value="plots/models_stack_area_en_es.png",
|
85 |
+
# label="Cumulative Growth of Models",
|
86 |
+
# show_label=True,
|
87 |
+
# show_download_button=True,
|
88 |
+
# show_share_button=True,
|
89 |
+
# )
|
90 |
+
|
91 |
with gr.Row():
|
92 |
update_button = gr.Button("Update Plots with Latest Data")
|
93 |
output_label = gr.Label()
|
|
|
96 |
"""
|
97 |
## Adapt to other languages
|
98 |
|
99 |
+
This Space is WIP and more languages and visuals will be included shortly. Meanwhile, you can clone the Space, adapt the code in the scripts and run it to generate plots for other languages.
|
100 |
"""
|
101 |
)
|
102 |
|
|
|
119 |
)
|
120 |
|
121 |
update_button.click(
|
122 |
+
fn=run_scripts,
|
123 |
outputs=output_label,
|
124 |
)
|
125 |
|
hub_datasets_by_language.py
CHANGED
@@ -133,7 +133,7 @@ def create_bar_plots(datasets, output_dir):
|
|
133 |
plt.legend()
|
134 |
plt.grid(GRID)
|
135 |
plt.tight_layout()
|
136 |
-
plt.savefig(f"{output_dir}/
|
137 |
plt.close()
|
138 |
|
139 |
# Vertical bar plot
|
@@ -160,7 +160,7 @@ def create_bar_plots(datasets, output_dir):
|
|
160 |
plt.legend()
|
161 |
plt.tight_layout()
|
162 |
plt.grid(GRID)
|
163 |
-
plt.savefig(f"{output_dir}/
|
164 |
plt.close()
|
165 |
|
166 |
|
@@ -184,7 +184,7 @@ def create_pie_chart(datasets, output_dir):
|
|
184 |
],
|
185 |
)
|
186 |
plt.axis("equal")
|
187 |
-
plt.savefig(f"{output_dir}/
|
188 |
plt.close()
|
189 |
|
190 |
|
@@ -231,7 +231,7 @@ def create_time_series(datasets, output_dir):
|
|
231 |
plt.legend(loc="upper left")
|
232 |
plt.tight_layout()
|
233 |
plt.grid(GRID)
|
234 |
-
plt.savefig(f"{output_dir}/
|
235 |
plt.close()
|
236 |
|
237 |
|
@@ -275,7 +275,7 @@ def create_stack_area_plots(datasets, output_dir):
|
|
275 |
plt.legend(loc="upper left")
|
276 |
plt.tight_layout()
|
277 |
plt.grid(GRID)
|
278 |
-
plt.savefig(f"{output_dir}/
|
279 |
plt.close()
|
280 |
|
281 |
# Plot stacked area for all except English
|
@@ -299,7 +299,7 @@ def create_stack_area_plots(datasets, output_dir):
|
|
299 |
plt.legend(loc="upper left")
|
300 |
plt.tight_layout()
|
301 |
plt.grid(GRID)
|
302 |
-
plt.savefig(f"{output_dir}/
|
303 |
plt.close()
|
304 |
|
305 |
# Plot stacked area for English and Spanish
|
@@ -317,7 +317,7 @@ def create_stack_area_plots(datasets, output_dir):
|
|
317 |
plt.legend(loc="upper left")
|
318 |
plt.tight_layout()
|
319 |
plt.grid(GRID)
|
320 |
-
plt.savefig(f"{output_dir}/
|
321 |
plt.close()
|
322 |
|
323 |
# Plot stacked area for Spanish only
|
@@ -335,7 +335,7 @@ def create_stack_area_plots(datasets, output_dir):
|
|
335 |
plt.legend(loc="upper left")
|
336 |
plt.tight_layout()
|
337 |
plt.grid(GRID)
|
338 |
-
plt.savefig(f"{output_dir}/
|
339 |
plt.close()
|
340 |
|
341 |
|
|
|
133 |
plt.legend()
|
134 |
plt.grid(GRID)
|
135 |
plt.tight_layout()
|
136 |
+
plt.savefig(f"{output_dir}/datasets_bar_plot_horizontal.png")
|
137 |
plt.close()
|
138 |
|
139 |
# Vertical bar plot
|
|
|
160 |
plt.legend()
|
161 |
plt.tight_layout()
|
162 |
plt.grid(GRID)
|
163 |
+
plt.savefig(f"{output_dir}/datasets_bar_plot_vertical.png")
|
164 |
plt.close()
|
165 |
|
166 |
|
|
|
184 |
],
|
185 |
)
|
186 |
plt.axis("equal")
|
187 |
+
plt.savefig(f"{output_dir}/datasets_pie_chart.png")
|
188 |
plt.close()
|
189 |
|
190 |
|
|
|
231 |
plt.legend(loc="upper left")
|
232 |
plt.tight_layout()
|
233 |
plt.grid(GRID)
|
234 |
+
plt.savefig(f"{output_dir}/datasets_time_series.png")
|
235 |
plt.close()
|
236 |
|
237 |
|
|
|
275 |
plt.legend(loc="upper left")
|
276 |
plt.tight_layout()
|
277 |
plt.grid(GRID)
|
278 |
+
plt.savefig(f"{output_dir}/datasets_stack_area.png")
|
279 |
plt.close()
|
280 |
|
281 |
# Plot stacked area for all except English
|
|
|
299 |
plt.legend(loc="upper left")
|
300 |
plt.tight_layout()
|
301 |
plt.grid(GRID)
|
302 |
+
plt.savefig(f"{output_dir}/datasets_stack_area_es_ca_gl_eu.png")
|
303 |
plt.close()
|
304 |
|
305 |
# Plot stacked area for English and Spanish
|
|
|
317 |
plt.legend(loc="upper left")
|
318 |
plt.tight_layout()
|
319 |
plt.grid(GRID)
|
320 |
+
plt.savefig(f"{output_dir}/datasets_stack_area_en_es.png")
|
321 |
plt.close()
|
322 |
|
323 |
# Plot stacked area for Spanish only
|
|
|
335 |
plt.legend(loc="upper left")
|
336 |
plt.tight_layout()
|
337 |
plt.grid(GRID)
|
338 |
+
plt.savefig(f"{output_dir}/datasets_stack_area_es.png")
|
339 |
plt.close()
|
340 |
|
341 |
|
hub_models_by_language.py
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import pickle
|
3 |
+
from datetime import datetime
|
4 |
+
|
5 |
+
import matplotlib.pyplot as plt
|
6 |
+
import pandas as pd
|
7 |
+
from huggingface_hub import HfApi
|
8 |
+
|
9 |
+
# Define colors for each language
|
10 |
+
LANGUAGE_COLORS = {
|
11 |
+
"english": "orange",
|
12 |
+
"spanish": "blue",
|
13 |
+
}
|
14 |
+
|
15 |
+
|
16 |
+
def _is_monolingual(model, lang_code):
    """Return True if `model` carries `language:<lang_code>` and no other language tag."""
    target = f"language:{lang_code}"
    # `tags` may be None on a model entry; treat that as "no tags" instead of
    # failing on iteration.
    language_tags = [tag for tag in (model.tags or []) if tag.startswith("language:")]
    return bool(language_tags) and all(tag == target for tag in language_tags)


def fetch_models(cache_file="models_cache.pkl"):
    """Fetch monolingual English and Spanish models from the Hub, with caching.

    A cache file younger than 24 hours (by modification time) is loaded and
    returned as-is. Otherwise every model is listed from the Hub, filtered,
    and the filtered result is written back to `cache_file`.

    Args:
        cache_file: Path of the pickle file used as the cache.

    Returns:
        dict: ``{"english": [...], "spanish": [...]}`` where each list holds
        the models tagged with only that language (or whatever object the
        cache file contains, when the cache is used).
    """
    if os.path.exists(cache_file):
        cache_age = datetime.now().timestamp() - os.path.getmtime(cache_file)
        if cache_age < 24 * 3600:  # 24 hours in seconds
            print("Loading models from cache...")
            try:
                with open(cache_file, "rb") as f:
                    # NOTE(security): pickle.load can execute arbitrary code.
                    # This is only acceptable because the cache is a local
                    # file written by this script; never point `cache_file`
                    # at untrusted data.
                    return pickle.load(f)
            except (
                pickle.UnpicklingError,
                AttributeError,
                EOFError,
                ImportError,
                IndexError,
            ):
                # A truncated or incompatible cache should not be fatal:
                # fall through and rebuild it from the Hub.
                print("Cache is unreadable, fetching fresh data...")
        else:
            print("Cache is older than 24 hours, fetching fresh data...")
    else:
        print("No cache found, fetching models from Hugging Face Hub...")

    hf_api = HfApi()
    all_models = list(hf_api.list_models(full=True))

    # Keep only models whose language tags name exactly one language.
    filtered_models = {
        "english": [m for m in all_models if _is_monolingual(m, "en")],
        "spanish": [m for m in all_models if _is_monolingual(m, "es")],
    }

    # Cache the filtered models
    print("Saving models to cache...")
    with open(cache_file, "wb") as f:
        pickle.dump(filtered_models, f)

    return filtered_models
|
60 |
+
|
61 |
+
|
62 |
+
def _cumulative_monthly_counts(dates, date_range):
    """Return cumulative per-month counts of `dates` as a DataFrame indexed by `date_range`."""
    df = pd.DataFrame({"Date": dates})
    df["Count"] = 1
    df["Date"] = pd.to_datetime(df["Date"])
    # Group by month start, then align to the common range; months without
    # any model get a count of 0.
    monthly = df.groupby(pd.Grouper(key="Date", freq="MS")).sum()
    monthly = monthly.reindex(date_range, fill_value=0)
    return monthly.cumsum()


def create_stack_area_plot(models, output_dir):
    """Create stacked area plot for English and Spanish models.

    Args:
        models: Mapping with "english" and "spanish" keys, each a list of
            objects exposing a `created_at` datetime.
        output_dir: Directory in which `models_stack_area_en_es.png` is saved.

    Returns:
        None. When no model is found for any language, a message is printed
        and no file is written.
    """
    languages = ["english", "spanish"]
    dates_by_lang = {
        lang: [d.created_at.date() for d in models[lang]] for lang in languages
    }
    all_dates = [day for lang in languages for day in dates_by_lang[lang]]

    if not all_dates:
        print("No models found for any language. Skipping plot creation.")
        return

    # Create a common date range for all languages. The start is anchored to
    # the first day of the earliest month: the monthly groups are labelled by
    # month start, so a range beginning mid-month would start at the
    # *following* month and the earliest month's models would be dropped by
    # the reindex, lowering every cumulative total.
    min_date = min(all_dates).replace(day=1)
    max_date = max(all_dates)
    date_range = pd.date_range(start=min_date, end=max_date, freq="MS")

    # One cumulative series per language, all sharing the same index.
    dfs = {
        lang: _cumulative_monthly_counts(dates_by_lang[lang], date_range)
        for lang in languages
    }

    # Plot stacked area for English and Spanish
    plt.figure(figsize=(10, 6))
    plt.stackplot(
        date_range,
        [dfs[lang]["Count"].values for lang in languages],
        labels=["English", "Spanish"],
        colors=[LANGUAGE_COLORS[lang] for lang in languages],
    )

    plt.xlabel("Date", fontsize=10)
    plt.ylabel("Cumulative Number of Models", fontsize=10)
    plt.xticks(rotation=45, fontsize=10)
    plt.legend(loc="upper left")
    plt.tight_layout()
    plt.savefig(f"{output_dir}/models_stack_area_en_es.png")
    plt.close()
|
107 |
+
|
108 |
+
|
109 |
+
def main():
    """Fetch the models, report how many were found per language, and save the plot."""
    # Make sure the destination folder for the figure exists.
    output_dir = "plots"
    os.makedirs(output_dir, exist_ok=True)

    # Retrieve the per-language model lists.
    print("Fetching models from Hugging Face Hub...")
    models = fetch_models()

    # Report how many models each language has.
    print("\nModel counts:")
    for lang in models:
        total = len(models[lang])
        print(f"{lang.capitalize()}: {total}")

    # Render and save the figure.
    print("\nCreating stack area plot...")
    create_stack_area_plot(models, output_dir)

    print(f"Plot has been saved to the '{output_dir}' directory")
|
128 |
+
|
129 |
+
|
130 |
+
if __name__ == "__main__":
|
131 |
+
main()
|
plots/{bar_plot_horizontal.png → datasets_bar_plot_horizontal.png}
RENAMED
File without changes
|
plots/{bar_plot_vertical.png → datasets_bar_plot_vertical.png}
RENAMED
File without changes
|
plots/{pie_chart.png → datasets_pie_chart.png}
RENAMED
File without changes
|
plots/{stack_area.png → datasets_stack_area.png}
RENAMED
File without changes
|
plots/{stack_area_en_es.png → datasets_stack_area_en_es.png}
RENAMED
File without changes
|
plots/{stack_area_es.png → datasets_stack_area_es.png}
RENAMED
File without changes
|
plots/{stack_area_es_ca_gl_eu.png → datasets_stack_area_es_ca_gl_eu.png}
RENAMED
File without changes
|