burtenshaw HF Staff committed on
Commit
547d83c
·
1 Parent(s): 2b02c75

add create dataset script

Browse files
Files changed (3) hide show
  1. create_argilla_dataset.py +88 -0
  2. requirements.txt +3 -0
  3. template.jinja +12 -0
create_argilla_dataset.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import urllib.parse
3
+ from http import client
4
+
5
+ import argilla as rg
6
+ import requests
7
+ from jinja2 import Template
8
+
9
+
10
def generate_html(image_urls, app_url):
    """Build an HTML snippet that embeds the app at ``app_url`` in an iframe.

    Each image URL is fully percent-encoded and passed to the app as a query
    parameter named ``img1``, ``img2``, ... in the order given.

    Args:
        image_urls: Iterable of image URL strings to hand to the app.
        app_url: Base URL of the embedded app. A trailing slash is tolerated.

    Returns:
        The rendered HTML document as a string.
    """
    template_str = """
    <html>
    <body>
    <iframe src="{{ full_url }}" width="400" height="400" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
    </body>
    </html>
    """
    # Strip trailing slashes so "https://host/" does not become "https://host//?".
    base_url = f"{app_url.rstrip('/')}/?"
    # safe="" also encodes "/" and ":" so each URL survives as one query value.
    encoded_urls = [
        f"img{index}={urllib.parse.quote(url, safe='')}"
        for index, url in enumerate(image_urls, start=1)
    ]
    full_url = base_url + "&".join(encoded_urls)

    return Template(template_str).render(full_url=full_url)
28
+
29
+
30
def fetch_data(max_images, name="horse"):
    """Fetch image URLs from the ``gigant/horse2zebra`` dataset.

    Queries the ``rows`` endpoint of the Hugging Face datasets server for the
    first ``max_images`` rows of the ``train`` split.

    Args:
        max_images: Number of rows to request, starting at offset 0.
        name: Dataset config to read from (for example ``"horse"`` or
            ``"zebra"``).

    Returns:
        A list with the ``src`` URL of every returned row's image, or ``None``
        when the server answers with a status code other than 200.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    base_url = "https://datasets-server.huggingface.co/rows"
    params = {
        "dataset": "gigant/horse2zebra",
        "config": name,
        "split": "train",
        "offset": 0,
        "length": max_images,
    }

    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(base_url, params=params, timeout=30)

    if response.status_code != 200:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        return None

    data = response.json()
    return [row["row"]["image"]["src"] for row in data["rows"]]
48
+
49
+
50
def log_records(dataset, app_url, max_images=10):
    """Log one record per horse/zebra image pair to ``dataset``.

    Fetches up to ``max_images`` image URLs for each of the ``horse`` and
    ``zebra`` configs, pairs them positionally, renders an iframe snippet for
    every pair and logs the result under the ``gradio_app`` field.

    Args:
        dataset: Dataset object exposing ``records.log``.
        app_url: Base URL of the app embedded in each record.
        max_images: Maximum number of images to request per config.
    """
    horse_urls = fetch_data(max_images=max_images, name="horse")
    zebra_urls = fetch_data(max_images=max_images, name="zebra")

    # fetch_data returns None on a failed request; zip() would then raise an
    # opaque TypeError, so stop here with a clear message instead.
    if horse_urls is None or zebra_urls is None:
        print("Could not fetch image URLs; no records were logged.")
        return

    records = [
        rg.Record(
            fields={
                "gradio_app": generate_html(
                    image_urls=[horse_url, zebra_url], app_url=app_url
                )
            }
        )
        for horse_url, zebra_url in zip(horse_urls, zebra_urls)
    ]

    dataset.records.log(records)
61
+
62
+
63
def create_dataset(api_url="http://localhost:6900", api_key="owner.apikey"):
    """Create a new dataset with a single ``is_zebra`` label question.

    The dataset has one text field, ``gradio_app``, and is named
    ``horse2zebra_<suffix>`` where ``<suffix>`` is a random 8-digit hex string.

    Args:
        api_url: URL of the Argilla server.
        api_key: API key used to authenticate against the server.

    Returns:
        The created ``rg.Dataset``.
    """
    # Named argilla_client so it does not shadow the file-level
    # ``from http import client`` import.
    argilla_client = rg.Argilla(api_url=api_url, api_key=api_key)

    settings = rg.Settings(
        fields=[rg.TextField(name="gradio_app")],
        questions=[
            rg.LabelQuestion(
                name="is_zebra",
                description="Is this a zebra?",
                labels=["true", "false"],
            )
        ],
    )

    # 32 random bits instead of randint(0, 100): with only 101 possible names,
    # repeated runs against the same server soon hit an existing dataset.
    dataset_name = f"horse2zebra_{random.getrandbits(32):08x}"
    dataset = rg.Dataset(
        name=dataset_name, settings=settings, client=argilla_client
    )
    dataset.create()
    return dataset
82
+
83
+
84
if __name__ == "__main__":
    # Create the dataset on the remote server, then fill it with records
    # whose iframe points at the companion app.
    dataset = create_dataset(api_url="https://burtenshaw-gradio-field.hf.space")
    log_records(
        dataset,
        app_url="https://burtenshaw-gradio-field-app.hf.space",
        max_images=10,
    )
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Jinja2
2
+ datasets
3
+ gradio
template.jinja ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
{# Renders an iframe pointing at the app, with every entry of `image_urls` passed as an img1, img2, ... query parameter. #}
{# `app_url` is optional; when it is not supplied the local default below is used. #}
{% set base_url = (app_url | default("http://127.0.0.1:7860")) ~ "/?" %}
{# A namespace is needed because a plain `set` inside a `for` loop is scoped to the loop body and lost afterwards. #}
{% set ns = namespace(encoded_urls=[]) %}

{# `enumerate` is not available in templates; `loop.index` provides the 1-based position. #}
{% for url in image_urls %}
{% set ns.encoded_urls = ns.encoded_urls + ["img" ~ loop.index ~ "=" ~ (url | urlencode)] %}
{% endfor %}

{% set full_url = base_url ~ (ns.encoded_urls | join("&")) %}

# Image Gallery

<iframe src="{{ full_url }}" width="1000" height="1000" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>