mariagrandury committed on
Commit
a938b8a
·
1 Parent(s): 2a275ae

specify resource type in plot names

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
app.py CHANGED
@@ -3,17 +3,24 @@ import subprocess
3
  import gradio as gr
4
 
5
 
6
- def run_script():
7
  try:
8
- result = subprocess.run(
 
9
  ["python", "hub_datasets_by_language.py"],
10
  capture_output=True,
11
  text=True,
12
  check=True,
13
  )
14
- return "Script executed successfully! Plots have been updated."
 
 
 
 
 
 
15
  except subprocess.CalledProcessError as e:
16
- return f"Failed to execute script: {str(e.stderr)}"
17
 
18
 
19
  def create_app():
@@ -30,42 +37,57 @@ def create_app():
30
  """
31
  ## English vs Spanish Monolingual Datasets
32
 
33
- Note: We consider only **monolingual** datasets in these plots, i.e. datasets that only contain data in one language. This is because *most* of the multilingual datasets are usually machine-translated and we want to focus on original data.
34
  """
35
  )
36
 
37
  with gr.Row():
38
  with gr.Column():
39
  gr.Image(
40
- value="plots/bar_plot_horizontal.png",
41
- label="Distribution by Year (Horizontal)",
42
  show_label=True,
43
  show_download_button=True,
44
  show_share_button=True,
45
  )
46
  gr.Image(
47
- value="plots/stack_area_en_es.png",
48
- label="Stacked Area Plot",
49
  show_label=True,
50
  show_download_button=True,
51
  show_share_button=True,
52
  )
53
  with gr.Column():
54
  gr.Image(
55
- value="plots/bar_plot_vertical.png",
56
- label="Distribution by Year (Vertical)",
57
  show_label=True,
58
  show_download_button=True,
59
  show_share_button=True,
60
  )
61
  gr.Image(
62
- value="plots/time_series.png",
63
- label="Cumulative Growth Over Time",
64
  show_label=True,
65
  show_download_button=True,
66
  show_share_button=True,
67
  )
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  with gr.Row():
70
  update_button = gr.Button("Update Plots with Latest Data")
71
  output_label = gr.Label()
@@ -74,7 +96,7 @@ def create_app():
74
  """
75
  ## Adapt to other languages
76
 
77
- This Space is WIP and more languages and visuals will be included shortly. Meanwhile, you can clone the Space, adapt the code in the script and run it to generate plots for other languages.
78
  """
79
  )
80
 
@@ -97,7 +119,7 @@ def create_app():
97
  )
98
 
99
  update_button.click(
100
- fn=run_script,
101
  outputs=output_label,
102
  )
103
 
 
3
  import gradio as gr
4
 
5
 
6
def run_scripts():
    """Regenerate the plots by running the plotting script(s) in a subprocess.

    Each script is launched with the interpreter that is running this app, so
    it sees the same installed packages as the app itself.

    Returns:
        str: A status message; either the success text or a failure text that
        embeds the captured stderr (or the OS error) of the failing script.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import section.
    import sys

    # Only the datasets script is enabled for now.
    # TODO: add "hub_models_by_language.py" once the models plots are shown.
    scripts = ("hub_datasets_by_language.py",)

    # Fall back to "python" on PATH if the interpreter path is unavailable.
    interpreter = sys.executable or "python"

    try:
        for script in scripts:
            subprocess.run(
                [interpreter, script],
                capture_output=True,
                text=True,
                check=True,  # non-zero exit -> CalledProcessError
            )
    except subprocess.CalledProcessError as e:
        return f"Failed to execute scripts: {str(e.stderr)}"
    except OSError as e:
        # The interpreter itself could not be started (missing, not executable).
        return f"Failed to execute scripts: {e}"
    return "Scripts executed successfully! All plots have been updated."
24
 
25
 
26
  def create_app():
 
37
  """
38
  ## English vs Spanish Monolingual Datasets
39
 
40
+ Note: We consider only **monolingual** resources in these plots, i.e. datasets and models that only contain data in one language. This is because *most* of the multilingual resources are usually machine-translated and we want to focus on original data.
41
  """
42
  )
43
 
44
  with gr.Row():
45
  with gr.Column():
46
  gr.Image(
47
+ value="plots/datasets_bar_plot_horizontal.png",
48
+ label="Distribution of Datasets by Year (Horizontal)",
49
  show_label=True,
50
  show_download_button=True,
51
  show_share_button=True,
52
  )
53
  gr.Image(
54
+ value="plots/datasets_stack_area_en_es.png",
55
+ label="Cumulative Growth of Datasets (Stacked)",
56
  show_label=True,
57
  show_download_button=True,
58
  show_share_button=True,
59
  )
60
  with gr.Column():
61
  gr.Image(
62
+ value="plots/datasets_bar_plot_vertical.png",
63
+ label="Distribution of Datasets by Year (Vertical)",
64
  show_label=True,
65
  show_download_button=True,
66
  show_share_button=True,
67
  )
68
  gr.Image(
69
+ value="plots/datasets_time_series.png",
70
+ label="Cumulative Growth of Datasets (Line)",
71
  show_label=True,
72
  show_download_button=True,
73
  show_share_button=True,
74
  )
75
 
76
+ # gr.Markdown(
77
+ # """
78
+ # ## English vs Spanish Models
79
+ # """
80
+ # )
81
+
82
+ # with gr.Row():
83
+ # gr.Image(
84
+ # value="plots/models_stack_area_en_es.png",
85
+ # label="Cumulative Growth of Models",
86
+ # show_label=True,
87
+ # show_download_button=True,
88
+ # show_share_button=True,
89
+ # )
90
+
91
  with gr.Row():
92
  update_button = gr.Button("Update Plots with Latest Data")
93
  output_label = gr.Label()
 
96
  """
97
  ## Adapt to other languages
98
 
99
+ This Space is WIP and more languages and visuals will be included shortly. Meanwhile, you can clone the Space, adapt the code in the scripts and run it to generate plots for other languages.
100
  """
101
  )
102
 
 
119
  )
120
 
121
  update_button.click(
122
+ fn=run_scripts,
123
  outputs=output_label,
124
  )
125
 
hub_datasets_by_language.py CHANGED
@@ -133,7 +133,7 @@ def create_bar_plots(datasets, output_dir):
133
  plt.legend()
134
  plt.grid(GRID)
135
  plt.tight_layout()
136
- plt.savefig(f"{output_dir}/bar_plot_horizontal.png")
137
  plt.close()
138
 
139
  # Vertical bar plot
@@ -160,7 +160,7 @@ def create_bar_plots(datasets, output_dir):
160
  plt.legend()
161
  plt.tight_layout()
162
  plt.grid(GRID)
163
- plt.savefig(f"{output_dir}/bar_plot_vertical.png")
164
  plt.close()
165
 
166
 
@@ -184,7 +184,7 @@ def create_pie_chart(datasets, output_dir):
184
  ],
185
  )
186
  plt.axis("equal")
187
- plt.savefig(f"{output_dir}/pie_chart.png")
188
  plt.close()
189
 
190
 
@@ -231,7 +231,7 @@ def create_time_series(datasets, output_dir):
231
  plt.legend(loc="upper left")
232
  plt.tight_layout()
233
  plt.grid(GRID)
234
- plt.savefig(f"{output_dir}/time_series.png")
235
  plt.close()
236
 
237
 
@@ -275,7 +275,7 @@ def create_stack_area_plots(datasets, output_dir):
275
  plt.legend(loc="upper left")
276
  plt.tight_layout()
277
  plt.grid(GRID)
278
- plt.savefig(f"{output_dir}/stack_area.png")
279
  plt.close()
280
 
281
  # Plot stacked area for all except English
@@ -299,7 +299,7 @@ def create_stack_area_plots(datasets, output_dir):
299
  plt.legend(loc="upper left")
300
  plt.tight_layout()
301
  plt.grid(GRID)
302
- plt.savefig(f"{output_dir}/stack_area_es_ca_gl_eu.png")
303
  plt.close()
304
 
305
  # Plot stacked area for English and Spanish
@@ -317,7 +317,7 @@ def create_stack_area_plots(datasets, output_dir):
317
  plt.legend(loc="upper left")
318
  plt.tight_layout()
319
  plt.grid(GRID)
320
- plt.savefig(f"{output_dir}/stack_area_en_es.png")
321
  plt.close()
322
 
323
  # Plot stacked area for Spanish only
@@ -335,7 +335,7 @@ def create_stack_area_plots(datasets, output_dir):
335
  plt.legend(loc="upper left")
336
  plt.tight_layout()
337
  plt.grid(GRID)
338
- plt.savefig(f"{output_dir}/stack_area_es.png")
339
  plt.close()
340
 
341
 
 
133
  plt.legend()
134
  plt.grid(GRID)
135
  plt.tight_layout()
136
+ plt.savefig(f"{output_dir}/datasets_bar_plot_horizontal.png")
137
  plt.close()
138
 
139
  # Vertical bar plot
 
160
  plt.legend()
161
  plt.tight_layout()
162
  plt.grid(GRID)
163
+ plt.savefig(f"{output_dir}/datasets_bar_plot_vertical.png")
164
  plt.close()
165
 
166
 
 
184
  ],
185
  )
186
  plt.axis("equal")
187
+ plt.savefig(f"{output_dir}/datasets_pie_chart.png")
188
  plt.close()
189
 
190
 
 
231
  plt.legend(loc="upper left")
232
  plt.tight_layout()
233
  plt.grid(GRID)
234
+ plt.savefig(f"{output_dir}/datasets_time_series.png")
235
  plt.close()
236
 
237
 
 
275
  plt.legend(loc="upper left")
276
  plt.tight_layout()
277
  plt.grid(GRID)
278
+ plt.savefig(f"{output_dir}/datasets_stack_area.png")
279
  plt.close()
280
 
281
  # Plot stacked area for all except English
 
299
  plt.legend(loc="upper left")
300
  plt.tight_layout()
301
  plt.grid(GRID)
302
+ plt.savefig(f"{output_dir}/datasets_stack_area_es_ca_gl_eu.png")
303
  plt.close()
304
 
305
  # Plot stacked area for English and Spanish
 
317
  plt.legend(loc="upper left")
318
  plt.tight_layout()
319
  plt.grid(GRID)
320
+ plt.savefig(f"{output_dir}/datasets_stack_area_en_es.png")
321
  plt.close()
322
 
323
  # Plot stacked area for Spanish only
 
335
  plt.legend(loc="upper left")
336
  plt.tight_layout()
337
  plt.grid(GRID)
338
+ plt.savefig(f"{output_dir}/datasets_stack_area_es.png")
339
  plt.close()
340
 
341
 
hub_models_by_language.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ from datetime import datetime
4
+
5
+ import matplotlib.pyplot as plt
6
+ import pandas as pd
7
+ from huggingface_hub import HfApi
8
+
9
+ # Define colors for each language
10
+ LANGUAGE_COLORS = {
11
+ "english": "orange",
12
+ "spanish": "blue",
13
+ }
14
+
15
+
16
def _is_monolingual(model, language_code):
    """Return True when the only language tag on *model* is *language_code*."""
    # Treat a missing or None `tags` attribute as "no tags" instead of crashing.
    language_tags = {
        tag
        for tag in (getattr(model, "tags", None) or [])
        if tag.startswith("language:")
    }
    return language_tags == {f"language:{language_code}"}


def fetch_models(cache_file="models_cache.pkl"):
    """Fetch monolingual English and Spanish models from the Hub, with caching.

    A cache file younger than 24 hours is returned as-is. Otherwise the Hub is
    queried, the result is filtered by language tag and written back to the
    cache file.

    Args:
        cache_file: Path of the pickle file used as the cache.

    Returns:
        dict: ``{"english": [...], "spanish": [...]}`` where each list holds
        the model entries tagged with that language and no other language.
    """
    max_cache_age = 24 * 3600  # 24 hours in seconds

    if os.path.exists(cache_file):
        cache_age = datetime.now().timestamp() - os.path.getmtime(cache_file)
        if cache_age < max_cache_age:
            print("Loading models from cache...")
            try:
                # NOTE: pickle must only be used on trusted files; this cache
                # is expected to be written exclusively by this script.
                with open(cache_file, "rb") as f:
                    return pickle.load(f)
            except (pickle.UnpicklingError, EOFError, AttributeError, ImportError) as err:
                # A truncated or incompatible cache should not abort the run.
                print(f"Cache could not be read ({err}), fetching fresh data...")
        else:
            print("Cache is older than 24 hours, fetching fresh data...")
    else:
        print("No cache found, fetching models from Hugging Face Hub...")

    hf_api = HfApi()

    language_codes = {"english": "en", "spanish": "es"}
    filtered_models = {language: [] for language in language_codes}

    # Stream the listing once instead of materialising every model and
    # scanning the full list once per language.
    for model in hf_api.list_models(full=True):
        for language, code in language_codes.items():
            if _is_monolingual(model, code):
                filtered_models[language].append(model)
                break  # a monolingual model matches at most one language

    # Cache the filtered models
    print("Saving models to cache...")
    with open(cache_file, "wb") as f:
        pickle.dump(filtered_models, f)

    return filtered_models
60
+
61
+
62
def _cumulative_monthly_counts(dates, month_index):
    """Return the cumulative count of *dates* per month, aligned to *month_index*."""
    frame = pd.DataFrame({"Date": dates})
    frame["Count"] = 1
    frame["Date"] = pd.to_datetime(frame["Date"])
    monthly = frame.groupby(pd.Grouper(key="Date", freq="MS")).sum()
    # Months without any model get 0 so the cumulative sum stays flat there.
    monthly = monthly.reindex(month_index, fill_value=0)
    return monthly.cumsum()


def create_stack_area_plot(models, output_dir):
    """Create a stacked area plot of cumulative English and Spanish models.

    Args:
        models: Mapping with the keys ``"english"`` and ``"spanish"``; each
            value is a list of objects exposing a ``created_at`` datetime.
        output_dir: Directory in which ``models_stack_area_en_es.png`` is saved.

    Returns:
        None. When no creation dates are available a message is printed and
        no file is written.
    """
    languages = ["english", "spanish"]

    # Collect creation dates once per language; entries without a creation
    # timestamp are skipped because they cannot be placed on the time axis.
    dates_by_language = {
        lang: [m.created_at.date() for m in models[lang] if m.created_at is not None]
        for lang in languages
    }
    all_dates = [d for lang in languages for d in dates_by_language[lang]]

    if not all_dates:
        print("No models found for any language. Skipping plot creation.")
        return

    # Create a common monthly range for all languages. The start is snapped to
    # the first day of its month: with freq="MS" a mid-month start would make
    # the range begin at the *next* month, and the reindex below would then
    # silently drop every model created in the earliest month.
    min_date = min(all_dates).replace(day=1)
    max_date = max(all_dates)
    date_range = pd.date_range(start=min_date, end=max_date, freq="MS")

    # One cumulative DataFrame per language, all sharing the same index
    dfs = {
        lang: _cumulative_monthly_counts(dates_by_language[lang], date_range)
        for lang in languages
    }

    # Plot stacked area for English and Spanish
    plt.figure(figsize=(10, 6))
    plt.stackplot(
        date_range,
        [dfs[lang]["Count"].values for lang in languages],
        labels=["English", "Spanish"],
        colors=[LANGUAGE_COLORS[lang] for lang in languages],
    )

    plt.xlabel("Date", fontsize=10)
    plt.ylabel("Cumulative Number of Models", fontsize=10)
    plt.xticks(rotation=45, fontsize=10)
    plt.legend(loc="upper left")
    plt.tight_layout()
    plt.savefig(f"{output_dir}/models_stack_area_en_es.png")
    plt.close()
107
+
108
+
109
def main():
    """Fetch the models, print per-language counts and render the plot."""
    plots_dir = "plots"
    # exist_ok=True makes this a no-op when the directory is already there
    os.makedirs(plots_dir, exist_ok=True)

    print("Fetching models from Hugging Face Hub...")
    models_by_language = fetch_models()

    print("\nModel counts:")
    for language in models_by_language:
        total = len(models_by_language[language])
        print(f"{language.capitalize()}: {total}")

    print("\nCreating stack area plot...")
    create_stack_area_plot(models_by_language, plots_dir)

    print(f"Plot has been saved to the '{plots_dir}' directory")
128
+
129
+
130
+ if __name__ == "__main__":
131
+ main()
plots/{bar_plot_horizontal.png → datasets_bar_plot_horizontal.png} RENAMED
File without changes
plots/{bar_plot_vertical.png → datasets_bar_plot_vertical.png} RENAMED
File without changes
plots/{pie_chart.png → datasets_pie_chart.png} RENAMED
File without changes
plots/{stack_area.png → datasets_stack_area.png} RENAMED
File without changes
plots/{stack_area_en_es.png → datasets_stack_area_en_es.png} RENAMED
File without changes
plots/{stack_area_es.png → datasets_stack_area_es.png} RENAMED
File without changes
plots/{stack_area_es_ca_gl_eu.png → datasets_stack_area_es_ca_gl_eu.png} RENAMED
File without changes
plots/{time_series.png → datasets_time_series.png} RENAMED
File without changes