add-hub-search

#3
by burtenshaw HF Staff - opened
Files changed (4) hide show
  1. app.py +12 -3
  2. pyproject.toml +1 -0
  3. requirements.txt +87 -8
  4. uv.lock +14 -0
app.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
2
  from datasets import load_dataset, Dataset
3
  from difflib import ndiff
4
  import pandas as pd
 
5
 
6
  from semhash import SemHash
7
  from semhash.datamodels import DeduplicationResult
@@ -323,7 +324,12 @@ with gr.Blocks(
323
  )
324
 
325
  with gr.Row():
326
- dataset1_name = gr.Textbox(value=default_dataset_name, label="Dataset 1 Name")
 
 
 
 
 
327
  dataset1_split = gr.Textbox(
328
  value=default_dataset1_split, label="Dataset 1 Split"
329
  )
@@ -334,8 +340,11 @@ with gr.Blocks(
334
  dataset2_inputs = gr.Column(visible=True)
335
  with dataset2_inputs:
336
  with gr.Row():
337
- dataset2_name = gr.Textbox(
338
- value=default_dataset_name, label="Dataset 2 Name"
 
 
 
339
  )
340
  dataset2_split = gr.Textbox(
341
  value=default_dataset2_split, label="Dataset 2 Split"
 
2
  from datasets import load_dataset, Dataset
3
  from difflib import ndiff
4
  import pandas as pd
5
+ from gradio_huggingfacehub_search import HuggingfaceHubSearch
6
 
7
  from semhash import SemHash
8
  from semhash.datamodels import DeduplicationResult
 
324
  )
325
 
326
  with gr.Row():
327
+ dataset1_name = HuggingfaceHubSearch(
328
+ label="Dataset 1 Name",
329
+ placeholder="Search for datasets on HuggingFace Hub",
330
+ search_type="dataset",
331
+ value=default_dataset_name,
332
+ )
333
  dataset1_split = gr.Textbox(
334
  value=default_dataset1_split, label="Dataset 1 Split"
335
  )
 
340
  dataset2_inputs = gr.Column(visible=True)
341
  with dataset2_inputs:
342
  with gr.Row():
343
+ dataset2_name = HuggingfaceHubSearch(
344
+ label="Dataset 2 Name",
345
+ placeholder="Search for datasets on HuggingFace Hub",
346
+ search_type="dataset",
347
+ value=default_dataset_name,
348
  )
349
  dataset2_split = gr.Textbox(
350
  value=default_dataset2_split, label="Dataset 2 Split"
pyproject.toml CHANGED
@@ -6,6 +6,7 @@ readme = "README.md"
6
  requires-python = ">=3.11"
7
  dependencies = [
8
  "datasets>=3.6.0",
 
9
  "gradio[oauth]>=5.32.1",
10
  "huggingface-hub>=0.32.3",
11
  "model2vec>=0.5.0",
 
6
  requires-python = ">=3.11"
7
  dependencies = [
8
  "datasets>=3.6.0",
9
+ "gradio-huggingfacehub-search>=0.0.12",
10
  "gradio[oauth]>=5.32.1",
11
  "huggingface-hub>=0.32.3",
12
  "model2vec>=0.5.0",
requirements.txt CHANGED
@@ -1,8 +1,87 @@
1
- gradio
2
- datasets
3
- semhash
4
- model2vec
5
- huggingface_hub
6
- numpy
7
- tqdm
8
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file was autogenerated by uv via the following command:
2
+ # uv export --format requirements-txt --no-hashes
3
+ aiofiles==24.1.0
4
+ aiohappyeyeballs==2.6.1
5
+ aiohttp==3.12.7
6
+ aiosignal==1.3.2
7
+ annotated-types==0.7.0
8
+ anyio==4.9.0
9
+ attrs==25.3.0
10
+ audioop-lts==0.2.1 ; python_full_version >= '3.13'
11
+ authlib==1.6.0
12
+ certifi==2025.4.26
13
+ cffi==1.17.1 ; platform_python_implementation != 'PyPy'
14
+ charset-normalizer==3.4.2
15
+ click==8.2.1 ; sys_platform != 'emscripten'
16
+ colorama==0.4.6 ; platform_system == 'Windows'
17
+ cryptography==45.0.3
18
+ datasets==3.6.0
19
+ dill==0.3.8
20
+ fastapi==0.115.12
21
+ ffmpy==0.6.0
22
+ filelock==3.18.0
23
+ frozendict==2.4.6
24
+ frozenlist==1.6.0
25
+ fsspec==2025.3.0
26
+ gradio==5.32.1
27
+ gradio-client==1.10.2
28
+ gradio-huggingfacehub-search==0.0.12
29
+ groovy==0.1.2
30
+ h11==0.16.0
31
+ hf-xet==1.1.2 ; platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'
32
+ httpcore==1.0.9
33
+ httpx==0.28.1
34
+ huggingface-hub==0.32.3
35
+ idna==3.10
36
+ itsdangerous==2.2.0
37
+ jinja2==3.1.6
38
+ joblib==1.5.1
39
+ markdown-it-py==3.0.0
40
+ markupsafe==3.0.2
41
+ mdurl==0.1.2
42
+ model2vec==0.5.0
43
+ multidict==6.4.4
44
+ multiprocess==0.70.16
45
+ numpy==2.2.6
46
+ orjson==3.10.18
47
+ packaging==25.0
48
+ pandas==2.2.3
49
+ pillow==11.2.1
50
+ propcache==0.3.1
51
+ pyarrow==20.0.0
52
+ pycparser==2.22 ; platform_python_implementation != 'PyPy'
53
+ pydantic==2.11.5
54
+ pydantic-core==2.33.2
55
+ pydub==0.25.1
56
+ pygments==2.19.1
57
+ python-dateutil==2.9.0.post0
58
+ python-multipart==0.0.20
59
+ pytz==2025.2
60
+ pyyaml==6.0.2
61
+ requests==2.32.3
62
+ rich==14.0.0
63
+ ruff==0.11.12 ; sys_platform != 'emscripten'
64
+ safehttpx==0.1.6
65
+ safetensors==0.5.3
66
+ semantic-version==2.10.0
67
+ semhash==0.3.0
68
+ setuptools==80.9.0
69
+ shellingham==1.5.4 ; sys_platform != 'emscripten'
70
+ simsimd==6.4.7
71
+ six==1.17.0
72
+ sniffio==1.3.1
73
+ starlette==0.46.2
74
+ tokenizers==0.21.1
75
+ tomlkit==0.13.2
76
+ tqdm==4.67.1
77
+ typer==0.16.0 ; sys_platform != 'emscripten'
78
+ typing-extensions==4.14.0
79
+ typing-inspection==0.4.1
80
+ tzdata==2025.2
81
+ urllib3==2.4.0
82
+ usearch==2.17.8
83
+ uvicorn==0.34.3 ; sys_platform != 'emscripten'
84
+ vicinity==0.4.1
85
+ websockets==15.0.1
86
+ xxhash==3.5.0
87
+ yarl==1.20.0
uv.lock CHANGED
@@ -582,6 +582,18 @@ wheels = [
582
  { url = "https://files.pythonhosted.org/packages/9b/1b/b372308c263379ae3ebc440512432979458330113bdee26cef86c89bf48e/gradio_client-1.10.2-py3-none-any.whl", hash = "sha256:6de67b6224123d264c7887caa0586b2a9e2c369ec32ca38927cf8a841694edcd", size = 323311 },
583
  ]
584
 
 
 
 
 
 
 
 
 
 
 
 
 
585
  [[package]]
586
  name = "groovy"
587
  version = "0.1.2"
@@ -1475,6 +1487,7 @@ source = { virtual = "." }
1475
  dependencies = [
1476
  { name = "datasets" },
1477
  { name = "gradio", extra = ["oauth"] },
 
1478
  { name = "huggingface-hub" },
1479
  { name = "model2vec" },
1480
  { name = "numpy" },
@@ -1486,6 +1499,7 @@ dependencies = [
1486
  requires-dist = [
1487
  { name = "datasets", specifier = ">=3.6.0" },
1488
  { name = "gradio", extras = ["oauth"], specifier = ">=5.32.1" },
 
1489
  { name = "huggingface-hub", specifier = ">=0.32.3" },
1490
  { name = "model2vec", specifier = ">=0.5.0" },
1491
  { name = "numpy", specifier = ">=2.2.6" },
 
582
  { url = "https://files.pythonhosted.org/packages/9b/1b/b372308c263379ae3ebc440512432979458330113bdee26cef86c89bf48e/gradio_client-1.10.2-py3-none-any.whl", hash = "sha256:6de67b6224123d264c7887caa0586b2a9e2c369ec32ca38927cf8a841694edcd", size = 323311 },
583
  ]
584
 
585
+ [[package]]
586
+ name = "gradio-huggingfacehub-search"
587
+ version = "0.0.12"
588
+ source = { registry = "https://pypi.org/simple" }
589
+ dependencies = [
590
+ { name = "gradio" },
591
+ ]
592
+ sdist = { url = "https://files.pythonhosted.org/packages/c2/aa/dbe96e59a31fbeacce39eb0a7828109dc3b8d0005a967f30acb48f3395b2/gradio_huggingfacehub_search-0.0.12.tar.gz", hash = "sha256:9da6b7b7c97f2ff8fcfad8d5789a68c79e995e0d39163bccfded422135df7288", size = 1199884 }
593
+ wheels = [
594
+ { url = "https://files.pythonhosted.org/packages/fc/f6/5aeb2b22a28e7961c2b396e4847c82c634cf139c1a80be697b824ec5e525/gradio_huggingfacehub_search-0.0.12-py3-none-any.whl", hash = "sha256:41d8870fa6b2b715848fcadb5e773eab4da0b9fe431c463a7cce2d616ad8f743", size = 1128435 },
595
+ ]
596
+
597
  [[package]]
598
  name = "groovy"
599
  version = "0.1.2"
 
1487
  dependencies = [
1488
  { name = "datasets" },
1489
  { name = "gradio", extra = ["oauth"] },
1490
+ { name = "gradio-huggingfacehub-search" },
1491
  { name = "huggingface-hub" },
1492
  { name = "model2vec" },
1493
  { name = "numpy" },
 
1499
  requires-dist = [
1500
  { name = "datasets", specifier = ">=3.6.0" },
1501
  { name = "gradio", extras = ["oauth"], specifier = ">=5.32.1" },
1502
+ { name = "gradio-huggingfacehub-search", specifier = ">=0.0.12" },
1503
  { name = "huggingface-hub", specifier = ">=0.32.3" },
1504
  { name = "model2vec", specifier = ">=0.5.0" },
1505
  { name = "numpy", specifier = ">=2.2.6" },