annabeth97c committed
Commit 4401dfb · 1 Parent(s): 700526e

feat: Update app for long audio captioning and chaining

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. README.md +89 -1
  2. app.py +86 -9
  3. requirements.txt +2 -0
  4. src/sonicverse/multi_token.egg-info/SOURCES.txt +0 -6
  5. src/sonicverse/multi_token.egg-info/requires.txt +0 -8
  6. src/sonicverse/requirements.txt +166 -7
  7. src/sonicverse/scripts/clap_gpt_build_finetune_dataset.py +1 -1
  8. src/sonicverse/scripts/clap_gpt_build_pretrain_dataset.py +1 -1
  9. src/sonicverse/scripts/document_build_finetune_dataset.py +2 -2
  10. src/sonicverse/scripts/document_build_pretrain_dataset.py +2 -2
  11. src/sonicverse/scripts/evaluate_model.py +4 -4
  12. src/sonicverse/scripts/evaluate_model_latest.py +4 -4
  13. src/sonicverse/scripts/evaluate_model_mullama.py +4 -4
  14. src/sonicverse/scripts/evaluate_model_mullama_musiccaps.py +4 -4
  15. src/sonicverse/scripts/evaluate_model_mullama_musiccaps_fixed_prompt.py +4 -4
  16. src/sonicverse/scripts/evaluate_mullama.py +4 -4
  17. src/sonicverse/scripts/evaluate_temp.py +4 -4
  18. src/sonicverse/scripts/gym_lunar_lander_build_dataset.py +1 -1
  19. src/sonicverse/scripts/gym_lunar_lander_client.py +1 -1
  20. src/sonicverse/scripts/imagebind_build_llava_finetune_dataset.py +1 -1
  21. src/sonicverse/scripts/imagebind_build_llava_pretrain_dataset.py +1 -1
  22. src/sonicverse/scripts/llava_build_finetune_dataset.py +1 -1
  23. src/sonicverse/scripts/llava_build_pretrain_dataset.py +1 -1
  24. src/sonicverse/scripts/llava_gpt_build_multi_image_finetune_dataset.py +1 -1
  25. src/sonicverse/scripts/serve_model.py +4 -4
  26. src/sonicverse/scripts/serve_model_gradio.py +4 -4
  27. src/sonicverse/scripts/train_model.py +5 -5
  28. src/sonicverse/scripts/whisper_build_pretrain_dataset.py +1 -1
  29. src/sonicverse/scripts/whisper_gpt_build_finetune_dataset.py +1 -1
  30. src/sonicverse/scripts/xclip_build_finetune_dataset.py +1 -1
  31. src/sonicverse/scripts/xclip_build_pretrain_dataset.py +1 -1
  32. src/sonicverse/setup.py +4 -4
  33. src/sonicverse/{multi_token.egg-info → sonicverse.egg-info}/PKG-INFO +4 -4
  34. src/sonicverse/sonicverse.egg-info/SOURCES.txt +6 -0
  35. src/sonicverse/{multi_token.egg-info → sonicverse.egg-info}/dependency_links.txt +0 -0
  36. src/sonicverse/sonicverse.egg-info/requires.txt +167 -0
  37. src/sonicverse/{multi_token.egg-info → sonicverse.egg-info}/top_level.txt +0 -0
  38. src/sonicverse/{multi_token → sonicverse}/constants.py +0 -0
  39. src/sonicverse/{multi_token → sonicverse}/data_tools.py +1 -1
  40. src/sonicverse/{multi_token → sonicverse}/inference.py +5 -5
  41. src/sonicverse/{multi_token → sonicverse}/language_models/__init__.py +1 -1
  42. src/sonicverse/{multi_token → sonicverse}/language_models/base_model.py +2 -2
  43. src/sonicverse/{multi_token → sonicverse}/language_models/mistral.py +1 -1
  44. src/sonicverse/{multi_token → sonicverse}/modalities/__init__.py +9 -9
  45. src/sonicverse/{multi_token → sonicverse}/modalities/audio_clap.py +4 -4
  46. src/sonicverse/{multi_token → sonicverse}/modalities/audio_descript.py +4 -4
  47. src/sonicverse/{multi_token → sonicverse}/modalities/audio_descript_bu.py +3 -3
  48. src/sonicverse/{multi_token → sonicverse}/modalities/audio_mert.py +5 -5
  49. src/sonicverse/{multi_token → sonicverse}/modalities/audio_mert_bu.py +4 -4
  50. src/sonicverse/{multi_token → sonicverse}/modalities/audio_whisper.py +3 -3
README.md CHANGED
@@ -9,4 +9,92 @@ app_file: app.py
 pinned: false
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# 🎼 SonicVerse
+
+An interactive demo for SonicVerse, a music captioning model that lets users input audio of up to 10 seconds and generate a natural-language caption
+that includes a general description of the music as well as musical features such as key, instruments, genre, mood/theme, and vocalist gender.
+
+---
+
+## 🚀 Demo
+
+Check out the live Space here:
+[![Hugging Face Space](https://img.shields.io/badge/HuggingFace-Space-blue?logo=huggingface)](https://huggingface.co/spaces/annabeth97c/SonicVerse)
+
+---
+
+## 🚀 Samples
+
+Short captions
+
+---
+
+## 📦 Features
+
+✅ Upload a 10-second music clip and get a caption
+
+✅ Upload a long music clip (up to 1 minute for a successful demo) to get a long, detailed caption for the whole music clip.
+
+---
+
+## 🛠️ How to Run Locally
+
+```bash
+# Clone the repo
+git clone https://github.com/AMAAI-Lab/SonicVerse
+cd SonicVerse
+
+# Install dependencies
+pip install -r requirements.txt
+
+# Alternatively, set up a conda environment
+conda env create -f environment.yml
+conda activate sonicverse
+
+# Run the app
+python app.py
+```
+
+---
+
+<!-- ## 📂 File Structure
+
+```
+.
+├── app.py             # Web app file
+├── requirements.txt   # Python dependencies
+├── environment.yml    # Conda environment
+├── README.md          # This file
+└── src/sonicverse     # Source
+```
+
+--- -->
+
+## 💡 Usage
+
+To use the app:
+1. Select an audio clip to input.
+2. Click the **Generate** button.
+3. See the model's output below.
+
+---
+
+## 🧹 Built With
+
+- [Hugging Face Spaces](https://huggingface.co/spaces)
+- [Gradio](https://gradio.app/)
+- [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
+- [MERT 95M](https://huggingface.co/m-a-p/MERT-v1-95M)
+---
+
+<!-- ## ✨ Acknowledgements
+
+- [Model authors or papers you built on]
+- [Contributors or collaborators]
+
+---
+
+## 📜 License
+
+This project is licensed under the MIT License / Apache 2.0 / Other.
+-->
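The new README points readers at the hosted Space. For reference, a minimal sketch of querying that Space programmatically with `gradio_client` (pinned to 0.6.1 in this commit); the `/predict` endpoint name is an assumption, since that is Gradio's default for a single `Interface`, and `my_clip.wav` is a placeholder path.

```python
# Minimal sketch: call the SonicVerse Space from Python via gradio_client 0.6.1.
# Assumptions: the Space exposes Gradio's default "/predict" endpoint, and
# "my_clip.wav" is a local audio file of up to ~1 minute.
from gradio_client import Client

client = Client("annabeth97c/SonicVerse")  # Space ID from the README badge
caption = client.predict("my_clip.wav", api_name="/predict")
print(caption)
```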
app.py CHANGED
@@ -18,11 +18,17 @@ import torch
 import transformers
 import torchaudio
 
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from openai import OpenAI
+client = OpenAI()
+MODEL = "gpt-4"
+SLEEP_BETWEEN_CALLS = 1.0
 
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
+
+CHUNK_LENGTH = 10
 
 @dataclass
 class ServeArguments(ModelArguments):
@@ -31,7 +37,6 @@ class ServeArguments(ModelArguments):
     temperature: float = field(default=0.01)
 
 
-# Load arguments and model
 logging.getLogger().setLevel(logging.INFO)
 
 parser = transformers.HfArgumentParser((ServeArguments,))
@@ -45,10 +50,82 @@ model, tokenizer = load_trained_lora_model(
     tasks_config=serve_args.tasks_config
 )
 
-@spaces.GPU(duration=60)
-def generate_caption(audio_file):
-    # waveform, sample_rate = torchaudio.load(audio_file)
+def caption_audio(audio_file):
+    chunk_audio_files = split_audio(audio_file, CHUNK_LENGTH)
+    chunk_captions = []
+    for audio_chunk in chunk_audio_files:
+        chunk_captions.append(generate_caption(audio_chunk))
+
+    if len(chunk_captions) > 1:
+        audio_name = os.path.splitext(os.path.basename(audio_file))[0]
+        long_caption = summarize_song(audio_name, chunk_captions)
+
+        delete_files(chunk_audio_files)
+
+        return long_caption
+
+    else:
+        if len(chunk_captions) == 1:
+            return chunk_captions[0]
+        else:
+            return ""
+
+def summarize_song(song_name, chunks):
+    prompt = f"""
+You are a music critic. Given the following chronological 10-second chunk descriptions of a single piece, write one flowing, detailed description of the entire song—its structure, instrumentation, and standout moments. Mention transition points in terms of time stamps. If the description of certain chunks does not seem to fit with those for the chunks before and after, treat those as bad descriptions with lower accuracy and do not incorporate the information. Retain concrete musical attributes such as key, chords, tempo.
+
+Chunks for “{song_name}”:
+"""
+    for i, c in enumerate(chunks, 1):
+        prompt += f"\n {(i - 1)*10} to {i*10} seconds. {c.strip()}"
+    prompt += "\n\nFull song description:"
+
+    resp = client.chat.completions.create(model=MODEL,
+        messages=[
+            {"role": "system", "content": "You are an expert music writer."},
+            {"role": "user", "content": prompt}
+        ],
+        temperature=0.0,
+        max_tokens=1000)
+    return resp.choices[0].message.content.strip()
+
+def delete_files(file_paths):
+    for path in file_paths:
+        try:
+            if os.path.isfile(path):
+                os.remove(path)
+                print(f"Deleted: {path}")
+            else:
+                print(f"Skipped (not a file or doesn't exist): {path}")
+        except Exception as e:
+            print(f"Error deleting {path}: {e}")
+
+def split_audio(input_path, chunk_length_seconds):
+
+    waveform, sample_rate = torchaudio.load(input_path)
+    num_channels, total_samples = waveform.shape
+    chunk_samples = int(chunk_length_seconds * sample_rate)
+
+    num_chunks = (total_samples + chunk_samples - 1) // chunk_samples
+
+    base, ext = os.path.splitext(input_path)
+    output_paths = []
+
+    if (num_chunks <= 1):
+        return [input_path]
+
+    for i in range(num_chunks):
+        start = i * chunk_samples
+        end = min((i + 1) * chunk_samples, total_samples)
+        chunk_waveform = waveform[:, start:end]
+
+        output_file = f"{base}_{i+1:03d}{ext}"
+        torchaudio.save(output_file, chunk_waveform, sample_rate)
+        output_paths.append(output_file)
+
+    return output_paths
 
+def generate_caption(audio_file):
     req_json = {
         "messages": [
             {"role": "user", "content": "Describe the music. <sound>"}
@@ -79,7 +156,7 @@ def generate_caption(audio_file):
 
 
 demo = gr.Interface(
-    fn=generate_caption,
+    fn=caption_audio,
     inputs=gr.Audio(type="filepath", label="Upload an audio file"),
     outputs=gr.Textbox(label="Generated Caption"),
     title="SonicVerse",
requirements.txt CHANGED
@@ -74,6 +74,7 @@ mdurl==0.1.2
 mpmath==1.3.0
 msgpack==1.0.8
 multidict==6.0.5
+git+https://huggingface.co/spaces/annabeth97c/temp#egg=multi_token&subdirectory=src/sonicverse
 multiprocess==0.70.16
 narwhals==1.40.0
 networkx==3.2.1
@@ -93,6 +94,7 @@ nvidia-cusparse-cu12==12.1.0.106
 nvidia-nccl-cu12==2.20.5
 nvidia-nvjitlink-cu12==12.5.82
 nvidia-nvtx-cu12==12.1.105
+openai==1.82.0
 orjson==3.10.18
 packaging==24.1
 pandas==2.2.2
src/sonicverse/multi_token.egg-info/SOURCES.txt DELETED
@@ -1,6 +0,0 @@
-setup.py
-multi_token.egg-info/PKG-INFO
-multi_token.egg-info/SOURCES.txt
-multi_token.egg-info/dependency_links.txt
-multi_token.egg-info/requires.txt
-multi_token.egg-info/top_level.txt
src/sonicverse/multi_token.egg-info/requires.txt DELETED
@@ -1,8 +0,0 @@
-transformers>=4.34.0
-accelerate>=0.21.0
-scipy>=1.11.3
-bitsandbytes>=0.41.0
-datasets>=2.14.5
-sentencepiece>=0.1.99
-peft>=0.4.0
-deepspeed==0.9.5
src/sonicverse/requirements.txt CHANGED
@@ -1,8 +1,167 @@
-transformers>=4.34.0
-accelerate>=0.21.0
-scipy>=1.11.3
-bitsandbytes>=0.41.0
-datasets>=2.14.5
-sentencepiece>=0.1.99
-peft>=0.4.0
+absl-py==2.1.0
+accelerate==0.29.3
+aiofiles==23.2.1
+aiohttp==3.9.5
+aiosignal==1.3.1
+altair==5.5.0
+anyio==4.9.0
+argbind==0.3.9
+asttokens==2.4.1
+async-timeout==4.0.3
+attrs==23.2.0
+audioread==3.0.1
+bert-score==0.3.13
+bitsandbytes==0.43.1
+blinker==1.8.2
+certifi==2024.7.4
+cffi==1.16.0
+charset-normalizer==3.3.2
+click==8.1.7
+contourpy==1.2.1
+cycler==0.12.1
+datasets==2.19.0
+decorator==5.1.1
 deepspeed==0.9.5
+descript-audio-codec==1.0.0
+descript-audiotools==0.7.2
+dill==0.3.8
+docstring_parser==0.16
+einops==0.8.0
+evaluate==0.4.3
+exceptiongroup==1.2.2
+executing==2.0.1
+fastapi==0.115.12
+ffmpy==0.3.2
+filelock==3.15.4
+fire==0.6.0
+Flask==3.0.3
+flatten-dict==0.4.2
+fonttools==4.53.1
+frozenlist==1.4.1
+fsspec==2024.3.1
+future==1.0.0
+gradio==3.50.2
+gradio_client==0.6.1
+graphviz==0.20.3
+grpcio==1.64.1
+h11==0.16.0
+hjson==3.1.0
+httpcore==1.0.9
+httpx==0.28.1
+huggingface-hub==0.23.4
+idna==3.7
+importlib_metadata==8.0.0
+importlib_resources==6.4.0
+ipython==8.18.1
+itsdangerous==2.2.0
+jedi==0.19.1
+Jinja2==3.1.4
+joblib==1.4.2
+jsonschema==4.23.0
+jsonschema-specifications==2025.4.1
+julius==0.2.7
+kiwisolver==1.4.5
+lazy_loader==0.4
+librosa==0.10.2.post1
+llvmlite==0.43.0
+Markdown==3.6
+markdown-it-py==3.0.0
+markdown2==2.5.0
+MarkupSafe==2.1.5
+matplotlib==3.9.1
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+mpmath==1.3.0
+msgpack==1.0.8
+multidict==6.0.5
+git+https://huggingface.co/spaces/annabeth97c/temp#egg=multi_token&subdirectory=src/sonicverse
+multiprocess==0.70.16
+narwhals==1.40.0
+networkx==3.2.1
+ninja==1.11.1.1
+nltk==3.8.1
+numba==0.60.0
+numpy==1.26.4
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==8.9.2.26
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.5.82
+nvidia-nvtx-cu12==12.1.105
+openai==1.82.0
+orjson==3.10.18
+packaging==24.1
+pandas==2.2.2
+parso==0.8.4
+peft==0.10.0
+pexpect==4.9.0
+pillow==10.3.0
+platformdirs==4.2.2
+pooch==1.8.2
+prompt_toolkit==3.0.47
+protobuf==3.19.6
+psutil==6.0.0
+ptyprocess==0.7.0
+pure-eval==0.2.2
+py-cpuinfo==9.0.0
+pyarrow==16.1.0
+pyarrow-hotfix==0.6
+pycparser==2.22
+pydantic==1.10.17
+pydub==0.25.1
+Pygments==2.18.0
+pyloudnorm==0.1.1
+pyparsing==3.1.2
+pystoi==0.4.1
+python-dateutil==2.9.0.post0
+python-multipart==0.0.20
+pytz==2024.1
+PyYAML==6.0.1
+randomname==0.2.1
+referencing==0.36.2
+regex==2024.5.15
+requests==2.32.3
+rich==13.7.1
+rouge_score==0.1.2
+rpds-py==0.25.0
+safetensors==0.4.3
+scikit-learn==1.5.1
+scipy==1.13.0
+semantic-version==2.10.0
+sentencepiece==0.2.0
+six==1.16.0
+sniffio==1.3.1
+soundfile==0.12.1
+soxr==0.3.7
+stack-data==0.6.3
+starlette==0.46.2
+sympy==1.13.0
+tensorboard==2.17.0
+tensorboard-data-server==0.7.2
+termcolor==2.4.0
+threadpoolctl==3.5.0
+tokenizers==0.19.1
+torch==2.3.1
+torch-stoi==0.2.1
+torchaudio==2.3.1
+torchviz==0.0.2
+tqdm==4.66.4
+traitlets==5.14.3
+transformers==4.40.1
+triton==2.3.1
+typing_extensions==4.12.2
+tzdata==2024.1
+urllib3==2.2.2
+uvicorn==0.34.2
+wcwidth==0.2.13
+websockets==11.0.3
+Werkzeug==3.0.3
+xxhash==3.4.1
+yarl==1.9.4
+zipp==3.19.2
src/sonicverse/scripts/clap_gpt_build_finetune_dataset.py CHANGED
@@ -7,7 +7,7 @@ import openai
 
 from datasets import Dataset, load_dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 PROMPT = """
 You are helping train a sound assistant that can take audio inputs and output text.
src/sonicverse/scripts/clap_gpt_build_pretrain_dataset.py CHANGED
@@ -7,7 +7,7 @@ import openai
 
 from datasets import Dataset, load_dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 PROMPT = """
 You are helping write captions for audio clips.
src/sonicverse/scripts/document_build_finetune_dataset.py CHANGED
@@ -7,8 +7,8 @@ import json
 from datasets import load_dataset
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
-from multi_token.modalities.document_gte import (
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.modalities.document_gte import (
     split_text_into_documents,
 )
 
src/sonicverse/scripts/document_build_pretrain_dataset.py CHANGED
@@ -5,8 +5,8 @@ import argparse
 from datasets import load_dataset
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
-from multi_token.modalities.document_gte import (
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.modalities.document_gte import (
     split_text_into_documents,
 )
 
src/sonicverse/scripts/evaluate_model.py CHANGED
@@ -7,12 +7,12 @@ import torch
 
 from datasets import load_from_disk
 
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import (
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import (
     ModelArguments,
 )
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 
 import evaluate
 
src/sonicverse/scripts/evaluate_model_latest.py CHANGED
@@ -5,10 +5,10 @@ from flask import Flask, request, jsonify
 import transformers
 import torch
 from datasets import load_from_disk
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 import evaluate
 import random
 import bert_score
src/sonicverse/scripts/evaluate_model_mullama.py CHANGED
@@ -5,10 +5,10 @@ from flask import Flask, request, jsonify
 import transformers
 import torch
 from datasets import load_from_disk
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 import evaluate
 import random
 import bert_score
src/sonicverse/scripts/evaluate_model_mullama_musiccaps.py CHANGED
@@ -5,10 +5,10 @@ from flask import Flask, request, jsonify
 import transformers
 import torch
 from datasets import load_from_disk
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 import evaluate
 import random
 import bert_score
src/sonicverse/scripts/evaluate_model_mullama_musiccaps_fixed_prompt.py CHANGED
@@ -4,10 +4,10 @@ from flask import Flask, request, jsonify
 import transformers
 import torch
 from datasets import load_from_disk
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 import evaluate
 import random
 import bert_score
src/sonicverse/scripts/evaluate_mullama.py CHANGED
@@ -4,10 +4,10 @@ from flask import Flask, request, jsonify
 import transformers
 import torch
 from datasets import load_from_disk
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 import evaluate
 import random
 import bert_score
src/sonicverse/scripts/evaluate_temp.py CHANGED
@@ -4,10 +4,10 @@ from flask import Flask, request, jsonify
 import transformers
 import torch
 from datasets import load_from_disk
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 import evaluate
 import random
 import bert_score
src/sonicverse/scripts/gym_lunar_lander_build_dataset.py CHANGED
@@ -12,7 +12,7 @@ import torch.nn as nn
 import numpy as np
 import torch
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 LUNAR_LANDER_OPTIONS = (
     "[FIRE LEFT ENGINE], [FIRE RIGHT ENGINE], [FIRE MAIN ENGINE], [NOTHING]".split(", ")
src/sonicverse/scripts/gym_lunar_lander_client.py CHANGED
@@ -6,7 +6,7 @@ import os
 from PIL import Image
 import gymnasium as gym
 
-from multi_token.constants import ROLE_USER
+from sonicverse.constants import ROLE_USER
 
 LUNAR_LANDER_OPTIONS = (
     "[FIRE LEFT ENGINE], [FIRE RIGHT ENGINE], [FIRE MAIN ENGINE], [NOTHING]".split(", ")
src/sonicverse/scripts/imagebind_build_llava_finetune_dataset.py CHANGED
@@ -6,7 +6,7 @@ import os
 
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 
 TYPES = ["audio", "image", "text"]
src/sonicverse/scripts/imagebind_build_llava_pretrain_dataset.py CHANGED
@@ -6,7 +6,7 @@ import os
 
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 
 TYPES = ["audio", "image", "text"]
src/sonicverse/scripts/llava_build_finetune_dataset.py CHANGED
@@ -5,7 +5,7 @@ import os
 
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 
 def _convert_convo(convo) -> List:
src/sonicverse/scripts/llava_build_pretrain_dataset.py CHANGED
@@ -5,7 +5,7 @@ import os
 
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 
 def _convert_convo(convo) -> List:
src/sonicverse/scripts/llava_gpt_build_multi_image_finetune_dataset.py CHANGED
@@ -7,7 +7,7 @@ import openai
 
 from datasets import Dataset, load_dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 PROMPT = """
 You are helping train a chat vision assistant that can take several image inputs and output text.
src/sonicverse/scripts/serve_model.py CHANGED
@@ -5,12 +5,12 @@ from flask import Flask, request, jsonify
 import transformers
 import torch
 
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import (
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import (
     ModelArguments,
 )
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 
 
 @dataclass
src/sonicverse/scripts/serve_model_gradio.py CHANGED
@@ -6,10 +6,10 @@ import torch
 import transformers
 import torchaudio
 
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 
 
 @dataclass
src/sonicverse/scripts/train_model.py CHANGED
@@ -1,20 +1,20 @@
 import transformers
 import logging
 
-from multi_token.training import (
+from sonicverse.training import (
     TrainingArguments,
     ModelArguments,
     train_for_modalities,
 )
-from multi_token.training_data import (
+from sonicverse.training_data import (
     DataArguments,
     TrainDataArguments,
     EvaluationDataArguments,
 )
 
-from multi_token.model_utils import MultiTaskType
-from multi_token.language_models import LANGUAGE_MODEL_NAME_TO_CLASS
-from multi_token.modalities import MODALITY_BUILDERS
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.language_models import LANGUAGE_MODEL_NAME_TO_CLASS
+from sonicverse.modalities import MODALITY_BUILDERS
 
 if __name__ == "__main__":
     logging.getLogger().setLevel(logging.INFO)
src/sonicverse/scripts/whisper_build_pretrain_dataset.py CHANGED
@@ -5,7 +5,7 @@ import argparse
 from datasets import load_dataset
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 DATASET_ARGS = dict(
     path="mozilla-foundation/common_voice_15_0", name="en", split="train"
src/sonicverse/scripts/whisper_gpt_build_finetune_dataset.py CHANGED
@@ -7,7 +7,7 @@ import openai
 
 from datasets import Dataset, load_dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 DATASET_ARGS = dict(
     path="mozilla-foundation/common_voice_15_0", name="en", split="train"
src/sonicverse/scripts/xclip_build_finetune_dataset.py CHANGED
@@ -5,7 +5,7 @@ import json
 
 from datasets import Dataset, load_dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 
 def _write_convo(row) -> List:
src/sonicverse/scripts/xclip_build_pretrain_dataset.py CHANGED
@@ -6,7 +6,7 @@ import json
 from huggingface_hub import hf_hub_download
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 PRETRAIN_PHRASES = [
     "Repeat the content of the video <video>",
src/sonicverse/setup.py CHANGED
@@ -5,11 +5,11 @@ with open("requirements.txt") as f:
 
 
 setup(
-    name="multi_token",
-    version="0.0.4",
+    name="sonicverse",
+    version="1.0.0",
     description="",
-    url="https://github.com/sshh12/multi_token",
-    author="Shrivu Shankar",
+    url="https://github.com/amaai-lab/SonicVerse",
+    author="Anuradha Chopra",
     license="Apache License 2.0",
     packages=find_packages(),
     include_package_data=True,
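The rename in `setup.py` (distribution `multi_token` 0.0.4 to `sonicverse` 1.0.0) means out-of-tree code that still does `import multi_token` will break. A hypothetical compatibility shim, not part of this commit, could alias the old module path:

```python
# Hypothetical shim (not in this commit): alias the renamed package so legacy
# "import multi_token" / "from multi_token.x import y" statements keep working.
import importlib
import sys

import sonicverse

sys.modules["multi_token"] = sonicverse
for sub in ("constants", "data_tools", "inference", "model_utils", "training"):
    # These are among the submodules whose imports this commit rewrites.
    sys.modules[f"multi_token.{sub}"] = importlib.import_module(f"sonicverse.{sub}")
```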
src/sonicverse/{multi_token.egg-info → sonicverse.egg-info}/PKG-INFO RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
-Name: multi-token
-Version: 0.0.4
-Home-page: https://github.com/sshh12/multi_token
-Author: Shrivu Shankar
+Name: sonicverse
+Version: 1.0.0
+Home-page: https://github.com/amaai-lab/SonicVerse
+Author: Anuradha Chopra
 License: Apache License 2.0
src/sonicverse/sonicverse.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,6 @@
+setup.py
+sonicverse.egg-info/PKG-INFO
+sonicverse.egg-info/SOURCES.txt
+sonicverse.egg-info/dependency_links.txt
+sonicverse.egg-info/requires.txt
+sonicverse.egg-info/top_level.txt
src/sonicverse/{multi_token.egg-info → sonicverse.egg-info}/dependency_links.txt RENAMED
File without changes
src/sonicverse/sonicverse.egg-info/requires.txt ADDED
@@ -0,0 +1,167 @@
+absl-py==2.1.0
+accelerate==0.29.3
+aiofiles==23.2.1
+aiohttp==3.9.5
+aiosignal==1.3.1
+altair==5.5.0
+anyio==4.9.0
+argbind==0.3.9
+asttokens==2.4.1
+async-timeout==4.0.3
+attrs==23.2.0
+audioread==3.0.1
+bert-score==0.3.13
+bitsandbytes==0.43.1
+blinker==1.8.2
+certifi==2024.7.4
+cffi==1.16.0
+charset-normalizer==3.3.2
+click==8.1.7
+contourpy==1.2.1
+cycler==0.12.1
+datasets==2.19.0
+decorator==5.1.1
+deepspeed==0.9.5
+descript-audio-codec==1.0.0
+descript-audiotools==0.7.2
+dill==0.3.8
+docstring_parser==0.16
+einops==0.8.0
+evaluate==0.4.3
+exceptiongroup==1.2.2
+executing==2.0.1
+fastapi==0.115.12
+ffmpy==0.3.2
+filelock==3.15.4
+fire==0.6.0
+Flask==3.0.3
+flatten-dict==0.4.2
+fonttools==4.53.1
+frozenlist==1.4.1
+fsspec==2024.3.1
+future==1.0.0
+gradio==3.50.2
+gradio_client==0.6.1
+graphviz==0.20.3
+grpcio==1.64.1
+h11==0.16.0
+hjson==3.1.0
+httpcore==1.0.9
+httpx==0.28.1
+huggingface-hub==0.23.4
+idna==3.7
+importlib_metadata==8.0.0
+importlib_resources==6.4.0
+ipython==8.18.1
+itsdangerous==2.2.0
+jedi==0.19.1
+Jinja2==3.1.4
+joblib==1.4.2
+jsonschema==4.23.0
+jsonschema-specifications==2025.4.1
+julius==0.2.7
+kiwisolver==1.4.5
+lazy_loader==0.4
+librosa==0.10.2.post1
+llvmlite==0.43.0
+Markdown==3.6
+markdown-it-py==3.0.0
+markdown2==2.5.0
+MarkupSafe==2.1.5
+matplotlib==3.9.1
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+mpmath==1.3.0
+msgpack==1.0.8
+multidict==6.0.5
+git+https://huggingface.co/spaces/annabeth97c/temp#egg=multi_token&subdirectory=src/sonicverse
+multiprocess==0.70.16
+narwhals==1.40.0
+networkx==3.2.1
+ninja==1.11.1.1
+nltk==3.8.1
+numba==0.60.0
+numpy==1.26.4
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==8.9.2.26
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.5.82
+nvidia-nvtx-cu12==12.1.105
+openai==1.82.0
+orjson==3.10.18
+packaging==24.1
+pandas==2.2.2
+parso==0.8.4
+peft==0.10.0
+pexpect==4.9.0
+pillow==10.3.0
+platformdirs==4.2.2
+pooch==1.8.2
+prompt_toolkit==3.0.47
+protobuf==3.19.6
+psutil==6.0.0
+ptyprocess==0.7.0
+pure-eval==0.2.2
+py-cpuinfo==9.0.0
+pyarrow==16.1.0
+pyarrow-hotfix==0.6
+pycparser==2.22
+pydantic==1.10.17
+pydub==0.25.1
+Pygments==2.18.0
+pyloudnorm==0.1.1
+pyparsing==3.1.2
+pystoi==0.4.1
+python-dateutil==2.9.0.post0
+python-multipart==0.0.20
+pytz==2024.1
+PyYAML==6.0.1
+randomname==0.2.1
+referencing==0.36.2
+regex==2024.5.15
+requests==2.32.3
+rich==13.7.1
+rouge_score==0.1.2
+rpds-py==0.25.0
+safetensors==0.4.3
+scikit-learn==1.5.1
+scipy==1.13.0
+semantic-version==2.10.0
+sentencepiece==0.2.0
+six==1.16.0
+sniffio==1.3.1
+soundfile==0.12.1
+soxr==0.3.7
+stack-data==0.6.3
+starlette==0.46.2
+sympy==1.13.0
+tensorboard==2.17.0
+tensorboard-data-server==0.7.2
+termcolor==2.4.0
+threadpoolctl==3.5.0
+tokenizers==0.19.1
+torch==2.3.1
+torch-stoi==0.2.1
+torchaudio==2.3.1
+torchviz==0.0.2
+tqdm==4.66.4
+traitlets==5.14.3
+transformers==4.40.1
+triton==2.3.1
+typing_extensions==4.12.2
+tzdata==2024.1
+urllib3==2.2.2
+uvicorn==0.34.2
+wcwidth==0.2.13
+websockets==11.0.3
+Werkzeug==3.0.3
+xxhash==3.4.1
+yarl==1.9.4
+zipp==3.19.2
src/sonicverse/{multi_token.egg-info → sonicverse.egg-info}/top_level.txt RENAMED
File without changes
src/sonicverse/{multi_token → sonicverse}/constants.py RENAMED
File without changes
src/sonicverse/{multi_token → sonicverse}/data_tools.py RENAMED
@@ -18,7 +18,7 @@ import numpy as np
 from datasets import load_dataset, Dataset
 from PIL import Image
 
-from multi_token.constants import IGNORE_INDEX
+from sonicverse.constants import IGNORE_INDEX
 
 
 def encode_chat(
src/sonicverse/{multi_token → sonicverse}/inference.py RENAMED
@@ -7,11 +7,11 @@ from peft import PeftModel
 import torch
 import os
 
-from multi_token.model_utils import fix_tokenizer, MultiTaskType
-from multi_token.modalities.base_modality import Modality
-from multi_token.language_models.mistral import MistralForCausalLM
-from multi_token.language_models import LANGUAGE_MODEL_NAME_TO_CLASS
-from multi_token.modalities import MODALITY_BUILDERS
+from sonicverse.model_utils import fix_tokenizer, MultiTaskType
+from sonicverse.modalities.base_modality import Modality
+from sonicverse.language_models.mistral import MistralForCausalLM
+from sonicverse.language_models import LANGUAGE_MODEL_NAME_TO_CLASS
+from sonicverse.modalities import MODALITY_BUILDERS
 
 
 def load_trained_lora_model(
src/sonicverse/{multi_token → sonicverse}/language_models/__init__.py RENAMED
@@ -1,4 +1,4 @@
-from multi_token.language_models.mistral import (
+from sonicverse.language_models.mistral import (
     MistralLMMForCausalLM,
 )
 
src/sonicverse/{multi_token → sonicverse}/language_models/base_model.py RENAMED
@@ -5,8 +5,8 @@ from torch.nn.functional import conv1d
 import torch
 import logging
 
-from multi_token.modalities.base_modality import Modality
-from multi_token.model_utils import MultiTaskType
+from sonicverse.modalities.base_modality import Modality
+from sonicverse.model_utils import MultiTaskType
 
 from torchviz import make_dot
 
src/sonicverse/{multi_token → sonicverse}/language_models/mistral.py RENAMED
@@ -15,7 +15,7 @@ from transformers import (
 
 from transformers.modeling_outputs import CausalLMOutputWithPast
 
-from multi_token.language_models.base_model import (
+from sonicverse.language_models.base_model import (
     LMMMetaModel,
     LMMMetaForCausalLM,
 )
src/sonicverse/{multi_token → sonicverse}/modalities/__init__.py RENAMED
@@ -1,15 +1,15 @@
-from multi_token.model_utils import MultiTaskType
-from multi_token.modalities.vision_clip import (
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.modalities.vision_clip import (
     CLIPVisionModality,
     OUTPUT_LAYER as CLIP_POOL_LAYER,
 )
-from multi_token.modalities.imagebind import ImageBindModality
-from multi_token.modalities.document_gte import DocumentGTEModality
-from multi_token.modalities.audio_whisper import WhisperAudioModality
-from multi_token.modalities.audio_clap import CLAPAudioModality
-from multi_token.modalities.video_xclip import XCLIPVideoModality
-from multi_token.modalities.audio_descript import DescriptAudioModality
-from multi_token.modalities.audio_mert import MERTAudioModality
+from sonicverse.modalities.imagebind import ImageBindModality
+from sonicverse.modalities.document_gte import DocumentGTEModality
+from sonicverse.modalities.audio_whisper import WhisperAudioModality
+from sonicverse.modalities.audio_clap import CLAPAudioModality
+from sonicverse.modalities.video_xclip import XCLIPVideoModality
+from sonicverse.modalities.audio_descript import DescriptAudioModality
+from sonicverse.modalities.audio_mert import MERTAudioModality
 
 MODALITY_BUILDERS = {
     "vision_clip": lambda: [CLIPVisionModality()],
src/sonicverse/{multi_token → sonicverse}/modalities/audio_clap.py RENAMED
@@ -4,10 +4,10 @@ import torch
 import torch.nn as nn
 from transformers import ClapModel, ClapProcessor
 
-from multi_token.model_utils import MultiTaskType
-from multi_token.data_tools import load_audio
-from multi_token.modalities.base_modality import Modality
-from multi_token.modalities.projectors import (
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.data_tools import load_audio
+from sonicverse.modalities.base_modality import Modality
+from sonicverse.modalities.projectors import (
     build_mlp_vector_projector, build_mt_vector_projector, MultiTaskModel
 )
 
src/sonicverse/{multi_token → sonicverse}/modalities/audio_descript.py RENAMED
@@ -5,10 +5,10 @@ import torch.nn as nn
 import dac
 from audiotools import AudioSignal
 
-from multi_token.model_utils import MultiTaskType
-from multi_token.data_tools import load_audio_signal
-from multi_token.modalities.base_modality import Modality
-from multi_token.modalities.projectors import (
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.data_tools import load_audio_signal
+from sonicverse.modalities.base_modality import Modality
+from sonicverse.modalities.projectors import (
     build_mlp_vector_projector, build_attentive_cnn_projector, build_cnn_mlp_projector, MultiTaskModel
 )
 
src/sonicverse/{multi_token → sonicverse}/modalities/audio_descript_bu.py RENAMED
@@ -6,9 +6,9 @@ import dac
 from audiotools import AudioSignal
 
 
-from multi_token.data_tools import load_audio_signal
-from multi_token.modalities.base_modality import Modality
-from multi_token.modalities.projectors import (
+from sonicverse.data_tools import load_audio_signal
+from sonicverse.modalities.base_modality import Modality
+from sonicverse.modalities.projectors import (
     build_mlp_vector_projector, build_attentive_cnn_projector, build_cnn_mlp_projector
 )
 
src/sonicverse/{multi_token → sonicverse}/modalities/audio_mert.py RENAMED
@@ -4,13 +4,13 @@ import torch
 import torch.nn as nn
 from transformers import Wav2Vec2FeatureExtractor, AutoModel
 
-from multi_token.model_utils import MultiTaskType
-from multi_token.data_tools import load_audio
-from multi_token.modalities.base_modality import Modality
-from multi_token.modalities.projectors import (
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.data_tools import load_audio
+from sonicverse.modalities.base_modality import Modality
+from sonicverse.modalities.projectors import (
     build_mlp_vector_projector, build_mt_vector_projector, build_multi_layer_cnn_mlp_projector, MultiTaskModel
 )
-from multi_token.modalities.multi_task_projector_shared import MultiTaskSharedModel
+from sonicverse.modalities.multi_task_projector_shared import MultiTaskSharedModel
 
 import json
 
src/sonicverse/{multi_token → sonicverse}/modalities/audio_mert_bu.py RENAMED
@@ -4,10 +4,10 @@ import torch
 import torch.nn as nn
 from transformers import Wav2Vec2FeatureExtractor, AutoModel
 
-from multi_token.model_utils import MultiTaskType
-from multi_token.data_tools import load_audio
-from multi_token.modalities.base_modality import Modality
-from multi_token.modalities.projectors import (
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.data_tools import load_audio
+from sonicverse.modalities.base_modality import Modality
+from sonicverse.modalities.projectors import (
     build_mlp_vector_projector, build_mt_vector_projector, build_multi_layer_cnn_mlp_projector, MultiTaskModel
 )
 
src/sonicverse/{multi_token → sonicverse}/modalities/audio_whisper.py RENAMED
@@ -4,9 +4,9 @@ import torch
 import torch.nn as nn
 from transformers import AutoFeatureExtractor, WhisperModel
 
-from multi_token.data_tools import load_audio
-from multi_token.modalities.base_modality import Modality
-from multi_token.modalities.projectors import (
+from sonicverse.data_tools import load_audio
+from sonicverse.modalities.base_modality import Modality
+from sonicverse.modalities.projectors import (
     build_mlp_vector_projector,
 )
 