Commit 4401dfb · Parent(s): 700526e

feat: Update app for long audio captioning and chaining

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the complete change set.
- README.md +89 -1
- app.py +86 -9
- requirements.txt +2 -0
- src/sonicverse/multi_token.egg-info/SOURCES.txt +0 -6
- src/sonicverse/multi_token.egg-info/requires.txt +0 -8
- src/sonicverse/requirements.txt +166 -7
- src/sonicverse/scripts/clap_gpt_build_finetune_dataset.py +1 -1
- src/sonicverse/scripts/clap_gpt_build_pretrain_dataset.py +1 -1
- src/sonicverse/scripts/document_build_finetune_dataset.py +2 -2
- src/sonicverse/scripts/document_build_pretrain_dataset.py +2 -2
- src/sonicverse/scripts/evaluate_model.py +4 -4
- src/sonicverse/scripts/evaluate_model_latest.py +4 -4
- src/sonicverse/scripts/evaluate_model_mullama.py +4 -4
- src/sonicverse/scripts/evaluate_model_mullama_musiccaps.py +4 -4
- src/sonicverse/scripts/evaluate_model_mullama_musiccaps_fixed_prompt.py +4 -4
- src/sonicverse/scripts/evaluate_mullama.py +4 -4
- src/sonicverse/scripts/evaluate_temp.py +4 -4
- src/sonicverse/scripts/gym_lunar_lander_build_dataset.py +1 -1
- src/sonicverse/scripts/gym_lunar_lander_client.py +1 -1
- src/sonicverse/scripts/imagebind_build_llava_finetune_dataset.py +1 -1
- src/sonicverse/scripts/imagebind_build_llava_pretrain_dataset.py +1 -1
- src/sonicverse/scripts/llava_build_finetune_dataset.py +1 -1
- src/sonicverse/scripts/llava_build_pretrain_dataset.py +1 -1
- src/sonicverse/scripts/llava_gpt_build_multi_image_finetune_dataset.py +1 -1
- src/sonicverse/scripts/serve_model.py +4 -4
- src/sonicverse/scripts/serve_model_gradio.py +4 -4
- src/sonicverse/scripts/train_model.py +5 -5
- src/sonicverse/scripts/whisper_build_pretrain_dataset.py +1 -1
- src/sonicverse/scripts/whisper_gpt_build_finetune_dataset.py +1 -1
- src/sonicverse/scripts/xclip_build_finetune_dataset.py +1 -1
- src/sonicverse/scripts/xclip_build_pretrain_dataset.py +1 -1
- src/sonicverse/setup.py +4 -4
- src/sonicverse/{multi_token.egg-info → sonicverse.egg-info}/PKG-INFO +4 -4
- src/sonicverse/sonicverse.egg-info/SOURCES.txt +6 -0
- src/sonicverse/{multi_token.egg-info → sonicverse.egg-info}/dependency_links.txt +0 -0
- src/sonicverse/sonicverse.egg-info/requires.txt +167 -0
- src/sonicverse/{multi_token.egg-info → sonicverse.egg-info}/top_level.txt +0 -0
- src/sonicverse/{multi_token → sonicverse}/constants.py +0 -0
- src/sonicverse/{multi_token → sonicverse}/data_tools.py +1 -1
- src/sonicverse/{multi_token → sonicverse}/inference.py +5 -5
- src/sonicverse/{multi_token → sonicverse}/language_models/__init__.py +1 -1
- src/sonicverse/{multi_token → sonicverse}/language_models/base_model.py +2 -2
- src/sonicverse/{multi_token → sonicverse}/language_models/mistral.py +1 -1
- src/sonicverse/{multi_token → sonicverse}/modalities/__init__.py +9 -9
- src/sonicverse/{multi_token → sonicverse}/modalities/audio_clap.py +4 -4
- src/sonicverse/{multi_token → sonicverse}/modalities/audio_descript.py +4 -4
- src/sonicverse/{multi_token → sonicverse}/modalities/audio_descript_bu.py +3 -3
- src/sonicverse/{multi_token → sonicverse}/modalities/audio_mert.py +5 -5
- src/sonicverse/{multi_token → sonicverse}/modalities/audio_mert_bu.py +4 -4
- src/sonicverse/{multi_token → sonicverse}/modalities/audio_whisper.py +3 -3
README.md
CHANGED

@@ -9,4 +9,92 @@ app_file: app.py
 pinned: false
 ---
 
-
+# 🎼 SonicVerse
+
+An interactive demo for SonicVerse, a music captioning model that lets users upload an audio clip of up to 10 seconds and generate a natural language caption
+that includes a general description of the music as well as musical features such as key, instruments, genre, mood/theme, and vocal gender.
+
+---
+
+## 🚀 Demo
+
+Check out the live Space here:
+[](https://huggingface.co/spaces/annabeth97c/SonicVerse)
+
+---
+
+## 🚀 Samples
+
+Short captions
+
+---
+
+## 📦 Features
+
+✅ Upload a 10-second music clip and get a caption.
+
+✅ Upload a long music clip (up to 1 minute for a successful demo) to get a long, detailed caption for the whole clip.
+
+---
+
+## 🛠️ How to Run Locally
+
+```bash
+# Clone the repo
+git clone https://github.com/AMAAI-Lab/SonicVerse
+cd SonicVerse
+
+# Install dependencies
+pip install -r requirements.txt
+
+# Alternatively, set up a conda environment
+conda env create -f environment.yml
+conda activate sonicverse
+
+# Run the app
+python app.py
+```
+
+---
+
+<!-- ## 📂 File Structure
+
+```
+.
+├── app.py             # Web app file
+├── requirements.txt   # Python dependencies
+├── environment.yml    # Conda environment
+├── README.md          # This file
+└── src/sonicverse     # Source
+```
+
+--- -->
+
+## 💡 Usage
+
+To use the app:
+1. Select an audio clip to input.
+2. Click the **Generate** button.
+3. See the model’s output below.
+
+---
+
+## 🧹 Built With
+
+- [Hugging Face Spaces](https://huggingface.co/spaces)
+- [Gradio](https://gradio.app/)
+- [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
+- [MERT 95M](https://huggingface.co/m-a-p/MERT-v1-95M)
+---
+
+<!-- ## ✨ Acknowledgements
+
+- [Model authors or papers you built on]
+- [Contributors or collaborators]
+
+---
+
+## 📜 License
+
+This project is licensed under the MIT License / Apache 2.0 / Other.
+-->
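The README drives everything through the hosted Space's UI; for completeness, here is a minimal sketch of querying the Space programmatically with `gradio_client` (pinned at 0.6.1 in this commit). The `api_name` value and the local file name are illustrative assumptions, not taken from the Space's published API.

```python
# Minimal sketch, assuming the Space exposes Gradio's default /predict endpoint.
from gradio_client import Client

client = Client("annabeth97c/SonicVerse")  # Space id from the README badge
caption = client.predict(
    "sample.wav",          # hypothetical local audio file (<= 1 minute per the README)
    api_name="/predict",   # assumed default endpoint name
)
print(caption)
```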
app.py
CHANGED

@@ -18,11 +18,17 @@ import torch
 import transformers
 import torchaudio
 
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from openai import OpenAI
+client = OpenAI()
+MODEL = "gpt-4"
+SLEEP_BETWEEN_CALLS = 1.0
 
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
+
+CHUNK_LENGTH = 10
 
 @dataclass
 class ServeArguments(ModelArguments):

@@ -31,7 +37,6 @@ class ServeArguments(ModelArguments):
     temperature: float = field(default=0.01)
 
 
-# Load arguments and model
 logging.getLogger().setLevel(logging.INFO)
 
 parser = transformers.HfArgumentParser((ServeArguments,))

@@ -45,10 +50,82 @@ model, tokenizer = load_trained_lora_model(
     tasks_config=serve_args.tasks_config
 )
 
-def generate_caption(audio_file):
-
-
+def caption_audio(audio_file):
+    chunk_audio_files = split_audio(audio_file, CHUNK_LENGTH)
+    chunk_captions = []
+    for audio_chunk in chunk_audio_files:
+        chunk_captions.append(generate_caption(audio_chunk))
+
+    if len(chunk_captions) > 1:
+        audio_name = os.path.splitext(os.path.basename(audio_file))[0]
+        long_caption = summarize_song(audio_name, chunk_captions)
+
+        delete_files(chunk_audio_files)
+
+        return long_caption
+
+    else:
+        if len(chunk_captions) == 1:
+            return chunk_captions[0]
+        else:
+            return ""
+
+def summarize_song(song_name, chunks):
+    prompt = f"""
+    You are a music critic. Given the following chronological 10-second chunk descriptions of a single piece, write one flowing, detailed description of the entire song—its structure, instrumentation, and standout moments. Mention transition points in terms of time stamps. If the description of certain chunks does not seem to fit with those for the chunks before and after, treat those as bad descriptions with lower accuracy and do not incorporate the information. Retain concrete musical attributes such as key, chords, tempo.
+
+    Chunks for "{song_name}":
+    """
+    for i, c in enumerate(chunks, 1):
+        prompt += f"\n {(i - 1)*10} to {i*10} seconds. {c.strip()}"
+    prompt += "\n\nFull song description:"
+
+    resp = client.chat.completions.create(model=MODEL,
+        messages=[
+            {"role": "system", "content": "You are an expert music writer."},
+            {"role": "user", "content": prompt}
+        ],
+        temperature=0.0,
+        max_tokens=1000)
+    return resp.choices[0].message.content.strip()
+
+def delete_files(file_paths):
+    for path in file_paths:
+        try:
+            if os.path.isfile(path):
+                os.remove(path)
+                print(f"Deleted: {path}")
+            else:
+                print(f"Skipped (not a file or doesn't exist): {path}")
+        except Exception as e:
+            print(f"Error deleting {path}: {e}")
+
+def split_audio(input_path, chunk_length_seconds):
+
+    waveform, sample_rate = torchaudio.load(input_path)
+    num_channels, total_samples = waveform.shape
+    chunk_samples = int(chunk_length_seconds * sample_rate)
+
+    num_chunks = (total_samples + chunk_samples - 1) // chunk_samples
+
+    base, ext = os.path.splitext(input_path)
+    output_paths = []
+
+    if (num_chunks <= 1):
+        return [input_path]
+
+    for i in range(num_chunks):
+        start = i * chunk_samples
+        end = min((i + 1) * chunk_samples, total_samples)
+        chunk_waveform = waveform[:, start:end]
+
+        output_file = f"{base}_{i+1:03d}{ext}"
+        torchaudio.save(output_file, chunk_waveform, sample_rate)
+        output_paths.append(output_file)
+
+    return output_paths
 
+def generate_caption(audio_file):
     req_json = {
         "messages": [
             {"role": "user", "content": "Describe the music. <sound>"}

@@ -79,7 +156,7 @@ def generate_caption(audio_file):
 
 
 demo = gr.Interface(
-    fn=generate_caption,
+    fn=caption_audio,
     inputs=gr.Audio(type="filepath", label="Upload an audio file"),
     outputs=gr.Textbox(label="Generated Caption"),
     title="SonicVerse",
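The new `split_audio` helper counts chunks by ceil division and lets the final chunk run short via the `min()` bound. A self-contained sketch of that arithmetic on a synthetic clip follows; it assumes only the `torch`/`torchaudio` versions pinned by this commit and a writable working directory.

```python
# Minimal sketch: exercise the ceil-division chunking used by split_audio above.
import torch
import torchaudio

sample_rate = 16000
seconds = 25  # 25 s at CHUNK_LENGTH = 10 -> chunks of 10 s, 10 s, 5 s
t = torch.arange(seconds * sample_rate, dtype=torch.float32) / sample_rate
waveform = torch.sin(2 * torch.pi * 440.0 * t).unsqueeze(0)  # mono 440 Hz tone
torchaudio.save("demo.wav", waveform, sample_rate)

chunk_samples = 10 * sample_rate
total_samples = waveform.shape[1]
num_chunks = (total_samples + chunk_samples - 1) // chunk_samples  # ceil division
print(num_chunks)  # 3: the last chunk is only 5 s long
```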
requirements.txt
CHANGED

@@ -74,6 +74,7 @@ mdurl==0.1.2
 mpmath==1.3.0
 msgpack==1.0.8
 multidict==6.0.5
+git+https://huggingface.co/spaces/annabeth97c/temp#egg=multi_token&subdirectory=src/sonicverse
 multiprocess==0.70.16
 narwhals==1.40.0
 networkx==3.2.1

@@ -93,6 +94,7 @@ nvidia-cusparse-cu12==12.1.0.106
 nvidia-nccl-cu12==2.20.5
 nvidia-nvjitlink-cu12==12.5.82
 nvidia-nvtx-cu12==12.1.105
+openai==1.82.0
 orjson==3.10.18
 packaging==24.1
 pandas==2.2.2
src/sonicverse/multi_token.egg-info/SOURCES.txt
DELETED

@@ -1,6 +0,0 @@
-setup.py
-multi_token.egg-info/PKG-INFO
-multi_token.egg-info/SOURCES.txt
-multi_token.egg-info/dependency_links.txt
-multi_token.egg-info/requires.txt
-multi_token.egg-info/top_level.txt
src/sonicverse/multi_token.egg-info/requires.txt
DELETED

@@ -1,8 +0,0 @@
-transformers>=4.34.0
-accelerate>=0.21.0
-scipy>=1.11.3
-bitsandbytes>=0.41.0
-datasets>=2.14.5
-sentencepiece>=0.1.99
-peft>=0.4.0
-deepspeed==0.9.5
src/sonicverse/requirements.txt
CHANGED

@@ -1,8 +1,167 @@
-transformers>=4.34.0
-accelerate>=0.21.0
-scipy>=1.11.3
-bitsandbytes>=0.41.0
-datasets>=2.14.5
-sentencepiece>=0.1.99
-peft>=0.4.0
+absl-py==2.1.0
+accelerate==0.29.3
+aiofiles==23.2.1
+aiohttp==3.9.5
+aiosignal==1.3.1
+altair==5.5.0
+anyio==4.9.0
+argbind==0.3.9
+asttokens==2.4.1
+async-timeout==4.0.3
+attrs==23.2.0
+audioread==3.0.1
+bert-score==0.3.13
+bitsandbytes==0.43.1
+blinker==1.8.2
+certifi==2024.7.4
+cffi==1.16.0
+charset-normalizer==3.3.2
+click==8.1.7
+contourpy==1.2.1
+cycler==0.12.1
+datasets==2.19.0
+decorator==5.1.1
 deepspeed==0.9.5
+descript-audio-codec==1.0.0
+descript-audiotools==0.7.2
+dill==0.3.8
+docstring_parser==0.16
+einops==0.8.0
+evaluate==0.4.3
+exceptiongroup==1.2.2
+executing==2.0.1
+fastapi==0.115.12
+ffmpy==0.3.2
+filelock==3.15.4
+fire==0.6.0
+Flask==3.0.3
+flatten-dict==0.4.2
+fonttools==4.53.1
+frozenlist==1.4.1
+fsspec==2024.3.1
+future==1.0.0
+gradio==3.50.2
+gradio_client==0.6.1
+graphviz==0.20.3
+grpcio==1.64.1
+h11==0.16.0
+hjson==3.1.0
+httpcore==1.0.9
+httpx==0.28.1
+huggingface-hub==0.23.4
+idna==3.7
+importlib_metadata==8.0.0
+importlib_resources==6.4.0
+ipython==8.18.1
+itsdangerous==2.2.0
+jedi==0.19.1
+Jinja2==3.1.4
+joblib==1.4.2
+jsonschema==4.23.0
+jsonschema-specifications==2025.4.1
+julius==0.2.7
+kiwisolver==1.4.5
+lazy_loader==0.4
+librosa==0.10.2.post1
+llvmlite==0.43.0
+Markdown==3.6
+markdown-it-py==3.0.0
+markdown2==2.5.0
+MarkupSafe==2.1.5
+matplotlib==3.9.1
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+mpmath==1.3.0
+msgpack==1.0.8
+multidict==6.0.5
+git+https://huggingface.co/spaces/annabeth97c/temp#egg=multi_token&subdirectory=src/sonicverse
+multiprocess==0.70.16
+narwhals==1.40.0
+networkx==3.2.1
+ninja==1.11.1.1
+nltk==3.8.1
+numba==0.60.0
+numpy==1.26.4
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==8.9.2.26
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.5.82
+nvidia-nvtx-cu12==12.1.105
+openai==1.82.0
+orjson==3.10.18
+packaging==24.1
+pandas==2.2.2
+parso==0.8.4
+peft==0.10.0
+pexpect==4.9.0
+pillow==10.3.0
+platformdirs==4.2.2
+pooch==1.8.2
+prompt_toolkit==3.0.47
+protobuf==3.19.6
+psutil==6.0.0
+ptyprocess==0.7.0
+pure-eval==0.2.2
+py-cpuinfo==9.0.0
+pyarrow==16.1.0
+pyarrow-hotfix==0.6
+pycparser==2.22
+pydantic==1.10.17
+pydub==0.25.1
+Pygments==2.18.0
+pyloudnorm==0.1.1
+pyparsing==3.1.2
+pystoi==0.4.1
+python-dateutil==2.9.0.post0
+python-multipart==0.0.20
+pytz==2024.1
+PyYAML==6.0.1
+randomname==0.2.1
+referencing==0.36.2
+regex==2024.5.15
+requests==2.32.3
+rich==13.7.1
+rouge_score==0.1.2
+rpds-py==0.25.0
+safetensors==0.4.3
+scikit-learn==1.5.1
+scipy==1.13.0
+semantic-version==2.10.0
+sentencepiece==0.2.0
+six==1.16.0
+sniffio==1.3.1
+soundfile==0.12.1
+soxr==0.3.7
+stack-data==0.6.3
+starlette==0.46.2
+sympy==1.13.0
+tensorboard==2.17.0
+tensorboard-data-server==0.7.2
+termcolor==2.4.0
+threadpoolctl==3.5.0
+tokenizers==0.19.1
+torch==2.3.1
+torch-stoi==0.2.1
+torchaudio==2.3.1
+torchviz==0.0.2
+tqdm==4.66.4
+traitlets==5.14.3
+transformers==4.40.1
+triton==2.3.1
+typing_extensions==4.12.2
+tzdata==2024.1
+urllib3==2.2.2
+uvicorn==0.34.2
+wcwidth==0.2.13
+websockets==11.0.3
+Werkzeug==3.0.3
+xxhash==3.4.1
+yarl==1.9.4
+zipp==3.19.2
src/sonicverse/scripts/clap_gpt_build_finetune_dataset.py
CHANGED

@@ -7,7 +7,7 @@ import openai
 
 from datasets import Dataset, load_dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 PROMPT = """
 You are helping train a sound assistant that can take audio inputs and output text.

src/sonicverse/scripts/clap_gpt_build_pretrain_dataset.py
CHANGED

@@ -7,7 +7,7 @@ import openai
 
 from datasets import Dataset, load_dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 PROMPT = """
 You are helping write captions for audio clips.

src/sonicverse/scripts/document_build_finetune_dataset.py
CHANGED

@@ -7,8 +7,8 @@ import json
 from datasets import load_dataset
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
-from multi_token.modalities.document_gte import (
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.modalities.document_gte import (
     split_text_into_documents,
 )
 

src/sonicverse/scripts/document_build_pretrain_dataset.py
CHANGED

@@ -5,8 +5,8 @@ import argparse
 from datasets import load_dataset
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
-from multi_token.modalities.document_gte import (
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.modalities.document_gte import (
     split_text_into_documents,
 )
 

src/sonicverse/scripts/evaluate_model.py
CHANGED

@@ -7,12 +7,12 @@ import torch
 
 from datasets import load_from_disk
 
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import (
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import (
     ModelArguments,
 )
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 
 import evaluate
 

src/sonicverse/scripts/evaluate_model_latest.py
CHANGED

@@ -5,10 +5,10 @@ from flask import Flask, request, jsonify
 import transformers
 import torch
 from datasets import load_from_disk
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 import evaluate
 import random
 import bert_score

src/sonicverse/scripts/evaluate_model_mullama.py
CHANGED

@@ -5,10 +5,10 @@ from flask import Flask, request, jsonify
 import transformers
 import torch
 from datasets import load_from_disk
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 import evaluate
 import random
 import bert_score

src/sonicverse/scripts/evaluate_model_mullama_musiccaps.py
CHANGED

@@ -5,10 +5,10 @@ from flask import Flask, request, jsonify
 import transformers
 import torch
 from datasets import load_from_disk
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 import evaluate
 import random
 import bert_score

src/sonicverse/scripts/evaluate_model_mullama_musiccaps_fixed_prompt.py
CHANGED

@@ -4,10 +4,10 @@ from flask import Flask, request, jsonify
 import transformers
 import torch
 from datasets import load_from_disk
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 import evaluate
 import random
 import bert_score

src/sonicverse/scripts/evaluate_mullama.py
CHANGED

@@ -4,10 +4,10 @@ from flask import Flask, request, jsonify
 import transformers
 import torch
 from datasets import load_from_disk
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 import evaluate
 import random
 import bert_score

src/sonicverse/scripts/evaluate_temp.py
CHANGED

@@ -4,10 +4,10 @@ from flask import Flask, request, jsonify
 import transformers
 import torch
 from datasets import load_from_disk
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 import evaluate
 import random
 import bert_score

src/sonicverse/scripts/gym_lunar_lander_build_dataset.py
CHANGED

@@ -12,7 +12,7 @@ import torch.nn as nn
 import numpy as np
 import torch
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 LUNAR_LANDER_OPTIONS = (
     "[FIRE LEFT ENGINE], [FIRE RIGHT ENGINE], [FIRE MAIN ENGINE], [NOTHING]".split(", ")

src/sonicverse/scripts/gym_lunar_lander_client.py
CHANGED

@@ -6,7 +6,7 @@ import os
 from PIL import Image
 import gymnasium as gym
 
-from multi_token.constants import ROLE_USER
+from sonicverse.constants import ROLE_USER
 
 LUNAR_LANDER_OPTIONS = (
     "[FIRE LEFT ENGINE], [FIRE RIGHT ENGINE], [FIRE MAIN ENGINE], [NOTHING]".split(", ")

src/sonicverse/scripts/imagebind_build_llava_finetune_dataset.py
CHANGED

@@ -6,7 +6,7 @@ import os
 
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 
 TYPES = ["audio", "image", "text"]

src/sonicverse/scripts/imagebind_build_llava_pretrain_dataset.py
CHANGED

@@ -6,7 +6,7 @@ import os
 
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 
 TYPES = ["audio", "image", "text"]

src/sonicverse/scripts/llava_build_finetune_dataset.py
CHANGED

@@ -5,7 +5,7 @@ import os
 
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 
 def _convert_convo(convo) -> List:

src/sonicverse/scripts/llava_build_pretrain_dataset.py
CHANGED

@@ -5,7 +5,7 @@ import os
 
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 
 def _convert_convo(convo) -> List:

src/sonicverse/scripts/llava_gpt_build_multi_image_finetune_dataset.py
CHANGED

@@ -7,7 +7,7 @@ import openai
 
 from datasets import Dataset, load_dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 PROMPT = """
 You are helping train a chat vision assistant that can take several image inputs and output text.

src/sonicverse/scripts/serve_model.py
CHANGED

@@ -5,12 +5,12 @@ from flask import Flask, request, jsonify
 import transformers
 import torch
 
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import (
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import (
     ModelArguments,
 )
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 
 
 @dataclass

src/sonicverse/scripts/serve_model_gradio.py
CHANGED

@@ -6,10 +6,10 @@ import torch
 import transformers
 import torchaudio
 
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 
 
 @dataclass

src/sonicverse/scripts/train_model.py
CHANGED

@@ -1,20 +1,20 @@
 import transformers
 import logging
 
-from multi_token.training import (
+from sonicverse.training import (
     TrainingArguments,
     ModelArguments,
     train_for_modalities,
 )
-from multi_token.training_data import (
+from sonicverse.training_data import (
     DataArguments,
     TrainDataArguments,
     EvaluationDataArguments,
 )
 
-from multi_token.model_utils import MultiTaskType
-from multi_token.language_models import LANGUAGE_MODEL_NAME_TO_CLASS
-from multi_token.modalities import MODALITY_BUILDERS
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.language_models import LANGUAGE_MODEL_NAME_TO_CLASS
+from sonicverse.modalities import MODALITY_BUILDERS
 
 if __name__ == "__main__":
     logging.getLogger().setLevel(logging.INFO)
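All of these entry points share the `HfArgumentParser` pattern visible in the `app.py` and `serve_model.py` hunks. A minimal standalone sketch of that pattern follows; the dataclass and its fields are hypothetical stand-ins for `ModelArguments` and friends, which this diff only renames.

```python
# Minimal sketch of the HfArgumentParser pattern used by these scripts.
from dataclasses import dataclass, field

import transformers


@dataclass
class DemoArguments:
    # Hypothetical fields; the real ones live in sonicverse.training.
    model_name_or_path: str = field(default="mistralai/Mistral-7B-v0.1")
    temperature: float = field(default=0.01)


if __name__ == "__main__":
    parser = transformers.HfArgumentParser((DemoArguments,))
    # Returns one populated instance per dataclass passed to the parser.
    (demo_args,) = parser.parse_args_into_dataclasses()
    print(demo_args)
```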
src/sonicverse/scripts/whisper_build_pretrain_dataset.py
CHANGED

@@ -5,7 +5,7 @@ import argparse
 from datasets import load_dataset
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 DATASET_ARGS = dict(
     path="mozilla-foundation/common_voice_15_0", name="en", split="train"

src/sonicverse/scripts/whisper_gpt_build_finetune_dataset.py
CHANGED

@@ -7,7 +7,7 @@ import openai
 
 from datasets import Dataset, load_dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 DATASET_ARGS = dict(
     path="mozilla-foundation/common_voice_15_0", name="en", split="train"

src/sonicverse/scripts/xclip_build_finetune_dataset.py
CHANGED

@@ -5,7 +5,7 @@ import json
 
 from datasets import Dataset, load_dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 
 def _write_convo(row) -> List:

src/sonicverse/scripts/xclip_build_pretrain_dataset.py
CHANGED

@@ -6,7 +6,7 @@ import json
 from huggingface_hub import hf_hub_download
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 PRETRAIN_PHRASES = [
     "Repeat the content of the video <video>",

src/sonicverse/setup.py
CHANGED

@@ -5,11 +5,11 @@ with open("requirements.txt") as f:
 
 
 setup(
-    name="multi_token",
-    version="0.0…",
+    name="sonicverse",
+    version="1.0.0",
     description="",
-    url="https://github.com/…",
-    author="…",
+    url="https://github.com/amaai-lab/SonicVerse",
+    author="Anuradha Chopra",
     license="Apache License 2.0",
     packages=find_packages(),
     include_package_data=True,
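Since `setup.py` now installs the package as `sonicverse`, a quick hedged check that the rename took effect might look like this; the editable-install path follows the repo layout shown above, and the constants are assumed to be simple role-name strings.

```python
# Minimal sketch, assuming an editable install of the renamed package:
#   pip install -e src/sonicverse
# Every `from multi_token...` import in this diff becomes `from sonicverse...`.
from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER  # was multi_token.constants

print(ROLE_USER, ROLE_ASSISTANT)  # assumed to be plain role-name strings
```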
src/sonicverse/{multi_token.egg-info → sonicverse.egg-info}/PKG-INFO
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
-Name: multi_token
-Version: 0.0…
-Home-page: https://github.com/…
-Author: …
+Name: sonicverse
+Version: 1.0.0
+Home-page: https://github.com/amaai-lab/SonicVerse
+Author: Anuradha Chopra
 License: Apache License 2.0
src/sonicverse/sonicverse.egg-info/SOURCES.txt
ADDED

@@ -0,0 +1,6 @@
+setup.py
+sonicverse.egg-info/PKG-INFO
+sonicverse.egg-info/SOURCES.txt
+sonicverse.egg-info/dependency_links.txt
+sonicverse.egg-info/requires.txt
+sonicverse.egg-info/top_level.txt

src/sonicverse/{multi_token.egg-info → sonicverse.egg-info}/dependency_links.txt
RENAMED
File without changes
src/sonicverse/sonicverse.egg-info/requires.txt
ADDED

@@ -0,0 +1,167 @@
+absl-py==2.1.0
+accelerate==0.29.3
+aiofiles==23.2.1
+aiohttp==3.9.5
+aiosignal==1.3.1
+altair==5.5.0
+anyio==4.9.0
+argbind==0.3.9
+asttokens==2.4.1
+async-timeout==4.0.3
+attrs==23.2.0
+audioread==3.0.1
+bert-score==0.3.13
+bitsandbytes==0.43.1
+blinker==1.8.2
+certifi==2024.7.4
+cffi==1.16.0
+charset-normalizer==3.3.2
+click==8.1.7
+contourpy==1.2.1
+cycler==0.12.1
+datasets==2.19.0
+decorator==5.1.1
+deepspeed==0.9.5
+descript-audio-codec==1.0.0
+descript-audiotools==0.7.2
+dill==0.3.8
+docstring_parser==0.16
+einops==0.8.0
+evaluate==0.4.3
+exceptiongroup==1.2.2
+executing==2.0.1
+fastapi==0.115.12
+ffmpy==0.3.2
+filelock==3.15.4
+fire==0.6.0
+Flask==3.0.3
+flatten-dict==0.4.2
+fonttools==4.53.1
+frozenlist==1.4.1
+fsspec==2024.3.1
+future==1.0.0
+gradio==3.50.2
+gradio_client==0.6.1
+graphviz==0.20.3
+grpcio==1.64.1
+h11==0.16.0
+hjson==3.1.0
+httpcore==1.0.9
+httpx==0.28.1
+huggingface-hub==0.23.4
+idna==3.7
+importlib_metadata==8.0.0
+importlib_resources==6.4.0
+ipython==8.18.1
+itsdangerous==2.2.0
+jedi==0.19.1
+Jinja2==3.1.4
+joblib==1.4.2
+jsonschema==4.23.0
+jsonschema-specifications==2025.4.1
+julius==0.2.7
+kiwisolver==1.4.5
+lazy_loader==0.4
+librosa==0.10.2.post1
+llvmlite==0.43.0
+Markdown==3.6
+markdown-it-py==3.0.0
+markdown2==2.5.0
+MarkupSafe==2.1.5
+matplotlib==3.9.1
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+mpmath==1.3.0
+msgpack==1.0.8
+multidict==6.0.5
+git+https://huggingface.co/spaces/annabeth97c/temp#egg=multi_token&subdirectory=src/sonicverse
+multiprocess==0.70.16
+narwhals==1.40.0
+networkx==3.2.1
+ninja==1.11.1.1
+nltk==3.8.1
+numba==0.60.0
+numpy==1.26.4
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==8.9.2.26
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.5.82
+nvidia-nvtx-cu12==12.1.105
+openai==1.82.0
+orjson==3.10.18
+packaging==24.1
+pandas==2.2.2
+parso==0.8.4
+peft==0.10.0
+pexpect==4.9.0
+pillow==10.3.0
+platformdirs==4.2.2
+pooch==1.8.2
+prompt_toolkit==3.0.47
+protobuf==3.19.6
+psutil==6.0.0
+ptyprocess==0.7.0
+pure-eval==0.2.2
+py-cpuinfo==9.0.0
+pyarrow==16.1.0
+pyarrow-hotfix==0.6
+pycparser==2.22
+pydantic==1.10.17
+pydub==0.25.1
+Pygments==2.18.0
+pyloudnorm==0.1.1
+pyparsing==3.1.2
+pystoi==0.4.1
+python-dateutil==2.9.0.post0
+python-multipart==0.0.20
+pytz==2024.1
+PyYAML==6.0.1
+randomname==0.2.1
+referencing==0.36.2
+regex==2024.5.15
+requests==2.32.3
+rich==13.7.1
+rouge_score==0.1.2
+rpds-py==0.25.0
+safetensors==0.4.3
+scikit-learn==1.5.1
+scipy==1.13.0
+semantic-version==2.10.0
+sentencepiece==0.2.0
+six==1.16.0
+sniffio==1.3.1
+soundfile==0.12.1
+soxr==0.3.7
+stack-data==0.6.3
+starlette==0.46.2
+sympy==1.13.0
+tensorboard==2.17.0
+tensorboard-data-server==0.7.2
+termcolor==2.4.0
+threadpoolctl==3.5.0
+tokenizers==0.19.1
+torch==2.3.1
+torch-stoi==0.2.1
+torchaudio==2.3.1
+torchviz==0.0.2
+tqdm==4.66.4
+traitlets==5.14.3
+transformers==4.40.1
+triton==2.3.1
+typing_extensions==4.12.2
+tzdata==2024.1
+urllib3==2.2.2
+uvicorn==0.34.2
+wcwidth==0.2.13
+websockets==11.0.3
+Werkzeug==3.0.3
+xxhash==3.4.1
+yarl==1.9.4
+zipp==3.19.2
src/sonicverse/{multi_token.egg-info → sonicverse.egg-info}/top_level.txt
RENAMED
File without changes

src/sonicverse/{multi_token → sonicverse}/constants.py
RENAMED
File without changes
src/sonicverse/{multi_token → sonicverse}/data_tools.py
RENAMED

@@ -18,7 +18,7 @@ import numpy as np
 from datasets import load_dataset, Dataset
 from PIL import Image
 
-from multi_token.constants import IGNORE_INDEX
+from sonicverse.constants import IGNORE_INDEX
 
 
 def encode_chat(

src/sonicverse/{multi_token → sonicverse}/inference.py
RENAMED

@@ -7,11 +7,11 @@ from peft import PeftModel
 import torch
 import os
 
-from multi_token.model_utils import fix_tokenizer, MultiTaskType
-from multi_token.modalities.base_modality import Modality
-from multi_token.language_models.mistral import MistralForCausalLM
-from multi_token.language_models import LANGUAGE_MODEL_NAME_TO_CLASS
-from multi_token.modalities import MODALITY_BUILDERS
+from sonicverse.model_utils import fix_tokenizer, MultiTaskType
+from sonicverse.modalities.base_modality import Modality
+from sonicverse.language_models.mistral import MistralForCausalLM
+from sonicverse.language_models import LANGUAGE_MODEL_NAME_TO_CLASS
+from sonicverse.modalities import MODALITY_BUILDERS
 
 
 def load_trained_lora_model(

src/sonicverse/{multi_token → sonicverse}/language_models/__init__.py
RENAMED

@@ -1,4 +1,4 @@
-from multi_token.language_models.mistral import (
+from sonicverse.language_models.mistral import (
     MistralLMMForCausalLM,
 )
 

src/sonicverse/{multi_token → sonicverse}/language_models/base_model.py
RENAMED

@@ -5,8 +5,8 @@ from torch.nn.functional import conv1d
 import torch
 import logging
 
-from multi_token.modalities.base_modality import Modality
-from multi_token.model_utils import MultiTaskType
+from sonicverse.modalities.base_modality import Modality
+from sonicverse.model_utils import MultiTaskType
 
 from torchviz import make_dot
 

src/sonicverse/{multi_token → sonicverse}/language_models/mistral.py
RENAMED

@@ -15,7 +15,7 @@ from transformers import (
 
 from transformers.modeling_outputs import CausalLMOutputWithPast
 
-from multi_token.language_models.base_model import (
+from sonicverse.language_models.base_model import (
     LMMMetaModel,
     LMMMetaForCausalLM,
 )
src/sonicverse/{multi_token → sonicverse}/modalities/__init__.py
RENAMED

@@ -1,15 +1,15 @@
-from multi_token.model_utils import MultiTaskType
-from multi_token.modalities.vision_clip import (
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.modalities.vision_clip import (
     CLIPVisionModality,
     OUTPUT_LAYER as CLIP_POOL_LAYER,
 )
-from multi_token.modalities.imagebind import ImageBindModality
-from multi_token.modalities.document_gte import DocumentGTEModality
-from multi_token.modalities.audio_whisper import WhisperAudioModality
-from multi_token.modalities.audio_clap import CLAPAudioModality
-from multi_token.modalities.video_xclip import XCLIPVideoModality
-from multi_token.modalities.audio_descript import DescriptAudioModality
-from multi_token.modalities.audio_mert import MERTAudioModality
+from sonicverse.modalities.imagebind import ImageBindModality
+from sonicverse.modalities.document_gte import DocumentGTEModality
+from sonicverse.modalities.audio_whisper import WhisperAudioModality
+from sonicverse.modalities.audio_clap import CLAPAudioModality
+from sonicverse.modalities.video_xclip import XCLIPVideoModality
+from sonicverse.modalities.audio_descript import DescriptAudioModality
+from sonicverse.modalities.audio_mert import MERTAudioModality
 
 MODALITY_BUILDERS = {
     "vision_clip": lambda: [CLIPVisionModality()],
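`MODALITY_BUILDERS` maps a config key to a zero-argument factory returning a list of modality instances, as the kept `vision_clip` entry shows. A hedged consumer sketch follows; only the `vision_clip` key is confirmed by this diff, so the helper treats every other key as potentially missing.

```python
# Minimal sketch of consuming the MODALITY_BUILDERS registry.
from sonicverse.modalities import MODALITY_BUILDERS


def build_modalities(key: str):
    try:
        factory = MODALITY_BUILDERS[key]  # each value is a zero-arg factory
    except KeyError:
        raise ValueError(f"Unknown modality key {key!r}; known: {sorted(MODALITY_BUILDERS)}")
    return factory()  # returns a list of modality instances


modalities = build_modalities("vision_clip")  # key confirmed by the diff above
print([type(m).__name__ for m in modalities])
```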
src/sonicverse/{multi_token → sonicverse}/modalities/audio_clap.py
RENAMED

@@ -4,10 +4,10 @@ import torch
 import torch.nn as nn
 from transformers import ClapModel, ClapProcessor
 
-from multi_token.model_utils import MultiTaskType
-from multi_token.data_tools import load_audio
-from multi_token.modalities.base_modality import Modality
-from multi_token.modalities.projectors import (
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.data_tools import load_audio
+from sonicverse.modalities.base_modality import Modality
+from sonicverse.modalities.projectors import (
     build_mlp_vector_projector, build_mt_vector_projector, MultiTaskModel
 )
 

src/sonicverse/{multi_token → sonicverse}/modalities/audio_descript.py
RENAMED

@@ -5,10 +5,10 @@ import torch.nn as nn
 import dac
 from audiotools import AudioSignal
 
-from multi_token.model_utils import MultiTaskType
-from multi_token.data_tools import load_audio_signal
-from multi_token.modalities.base_modality import Modality
-from multi_token.modalities.projectors import (
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.data_tools import load_audio_signal
+from sonicverse.modalities.base_modality import Modality
+from sonicverse.modalities.projectors import (
     build_mlp_vector_projector, build_attentive_cnn_projector, build_cnn_mlp_projector, MultiTaskModel
 )
 

src/sonicverse/{multi_token → sonicverse}/modalities/audio_descript_bu.py
RENAMED

@@ -6,9 +6,9 @@ import dac
 from audiotools import AudioSignal
 
 
-from multi_token.data_tools import load_audio_signal
-from multi_token.modalities.base_modality import Modality
-from multi_token.modalities.projectors import (
+from sonicverse.data_tools import load_audio_signal
+from sonicverse.modalities.base_modality import Modality
+from sonicverse.modalities.projectors import (
     build_mlp_vector_projector, build_attentive_cnn_projector, build_cnn_mlp_projector
 )
 

src/sonicverse/{multi_token → sonicverse}/modalities/audio_mert.py
RENAMED

@@ -4,13 +4,13 @@ import torch
 import torch.nn as nn
 from transformers import Wav2Vec2FeatureExtractor, AutoModel
 
-from multi_token.model_utils import MultiTaskType
-from multi_token.data_tools import load_audio
-from multi_token.modalities.base_modality import Modality
-from multi_token.modalities.projectors import (
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.data_tools import load_audio
+from sonicverse.modalities.base_modality import Modality
+from sonicverse.modalities.projectors import (
     build_mlp_vector_projector, build_mt_vector_projector, build_multi_layer_cnn_mlp_projector, MultiTaskModel
 )
-from multi_token.modalities.multi_task_projector_shared import MultiTaskSharedModel
+from sonicverse.modalities.multi_task_projector_shared import MultiTaskSharedModel
 
 import json
 

src/sonicverse/{multi_token → sonicverse}/modalities/audio_mert_bu.py
RENAMED

@@ -4,10 +4,10 @@ import torch
 import torch.nn as nn
 from transformers import Wav2Vec2FeatureExtractor, AutoModel
 
-from multi_token.model_utils import MultiTaskType
-from multi_token.data_tools import load_audio
-from multi_token.modalities.base_modality import Modality
-from multi_token.modalities.projectors import (
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.data_tools import load_audio
+from sonicverse.modalities.base_modality import Modality
+from sonicverse.modalities.projectors import (
     build_mlp_vector_projector, build_mt_vector_projector, build_multi_layer_cnn_mlp_projector, MultiTaskModel
 )
 

src/sonicverse/{multi_token → sonicverse}/modalities/audio_whisper.py
RENAMED

@@ -4,9 +4,9 @@ import torch
 import torch.nn as nn
 from transformers import AutoFeatureExtractor, WhisperModel
 
-from multi_token.data_tools import load_audio
-from multi_token.modalities.base_modality import Modality
-from multi_token.modalities.projectors import (
+from sonicverse.data_tools import load_audio
+from sonicverse.modalities.base_modality import Modality
+from sonicverse.modalities.projectors import (
     build_mlp_vector_projector,
 )
 