diff --git a/.gitattributes b/.gitattributes index 502cfe46efc5b98be4b3ef557f033bd590d9fdca..45830bb2ea7c14d5472a6f43889c9723f3b1b880 100644 --- a/.gitattributes +++ b/.gitattributes @@ -44,3 +44,5 @@ models/pvq_extractor/Resonance.onnx filter=lfs diff=lfs merge=lfs -text models/pvq_extractor/Weight.onnx filter=lfs diff=lfs merge=lfs -text models/norm_flow/model.pt filter=lfs diff=lfs merge=lfs -text audio/1034_121119_000028_000001.wav filter=lfs diff=lfs merge=lfs -text +Dataset/Audio_files/*.wav filter=lfs diff=lfs merge=lfs -text +Dataset/Embeddings/**/*.pth filter=lfs diff=lfs merge=lfs -text diff --git a/Dataset/Audio_files/1034_121119_000028_000001.wav b/Dataset/Audio_files/1034_121119_000028_000001.wav new file mode 100644 index 0000000000000000000000000000000000000000..2dbcd7cf1a0fb50ce0d607d5db1a86f995734fff --- /dev/null +++ b/Dataset/Audio_files/1034_121119_000028_000001.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc00c4e893ccf708cae4366e36ede93b4e158f516323a0724fc6e9f956c76aff +size 385964 diff --git a/Dataset/Audio_files/1088_129236_000006_000007.wav b/Dataset/Audio_files/1088_129236_000006_000007.wav new file mode 100644 index 0000000000000000000000000000000000000000..e6d4df40f936d8a012e04e648f4b2d8890a2b440 --- /dev/null +++ b/Dataset/Audio_files/1088_129236_000006_000007.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27b2e7191ba1cfad41bc1ab1bd09ec1af87062e48abbab1ef01809c76ed738da +size 311084 diff --git a/Dataset/Audio_files/1422_149735_000006_000000.wav b/Dataset/Audio_files/1422_149735_000006_000000.wav new file mode 100644 index 0000000000000000000000000000000000000000..5cefd0c6b0c913b8bb42631477775b5c638f8ad4 --- /dev/null +++ b/Dataset/Audio_files/1422_149735_000006_000000.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f59d7f7a3c7364d7ac254bd94d3384e9b8e173634eb8b7492ec751d8584f8bb5 +size 345644 diff --git a/Dataset/Audio_files/14_212_000019_000000.wav b/Dataset/Audio_files/14_212_000019_000000.wav new file mode 100644 index 0000000000000000000000000000000000000000..3337debbd72602bedb653eaaf1027653a3ee4485 --- /dev/null +++ b/Dataset/Audio_files/14_212_000019_000000.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1271f49cf4855d1b7d9b87e99a0c79e5505acbfba94cd8f594c1df2a29d96027 +size 633652 diff --git a/Dataset/Audio_files/1535_141644_000004_000001.wav b/Dataset/Audio_files/1535_141644_000004_000001.wav new file mode 100644 index 0000000000000000000000000000000000000000..54f34cd3adeabd635eca3a2d12e616cb5dbfdac8 --- /dev/null +++ b/Dataset/Audio_files/1535_141644_000004_000001.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cc6f0bc3b9ebecbc1dab5a430c37140337a6bbeaf6f75103d74b2b4e75b4f06 +size 295724 diff --git a/Dataset/Audio_files/1731_142320_000122_000005.wav b/Dataset/Audio_files/1731_142320_000122_000005.wav new file mode 100644 index 0000000000000000000000000000000000000000..7005b27d4ee4863f4aa20d0814bb369565e32d7f --- /dev/null +++ b/Dataset/Audio_files/1731_142320_000122_000005.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3d3e88bc53ee1cad73100ea4ad6ccc6d9bcbc36145962d400122b658e27b7e8 +size 316844 diff --git a/Dataset/Audio_files/3009_10327_000027_000005.wav b/Dataset/Audio_files/3009_10327_000027_000005.wav new file mode 100644 index 0000000000000000000000000000000000000000..137488422f1ecb73e27a98296ba184bad68ce7e7 --- /dev/null +++ b/Dataset/Audio_files/3009_10327_000027_000005.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbe46722ef2d331a5bc1c552cd6ad3c8a69022a3c70b1c03b609856dc073ca32 +size 309164 diff --git a/Dataset/Audio_files/329_861_000024_000003.wav b/Dataset/Audio_files/329_861_000024_000003.wav new file mode 100644 index 0000000000000000000000000000000000000000..0ecc6b2656a0ebac19680a94c156597ac20abf5a --- /dev/null +++ b/Dataset/Audio_files/329_861_000024_000003.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dab66421315b9f22bbbaf909e69184c01eaba29e536c2b449c8a7310f2edce7 +size 261164 diff --git a/Dataset/Audio_files/4830_25904_000008_000001.wav b/Dataset/Audio_files/4830_25904_000008_000001.wav new file mode 100644 index 0000000000000000000000000000000000000000..5bcb611ee93af1f3e1cddd58e6f121cf110a6311 --- /dev/null +++ b/Dataset/Audio_files/4830_25904_000008_000001.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd41aaf86c9d6e394d9afcca5e3128aa6a52fd2948e3bcf6aa03e5c18f2c7eec +size 483884 diff --git a/Dataset/Audio_files/4957_30119_000070_000001.wav b/Dataset/Audio_files/4957_30119_000070_000001.wav new file mode 100644 index 0000000000000000000000000000000000000000..50f51c378a8c42aacde6bfa8b0c1c606ea4f87e4 --- /dev/null +++ b/Dataset/Audio_files/4957_30119_000070_000001.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f483b0a7003610ba8451db035f3347b156bb348c7aa356b7403f8ca86b98ab28 +size 503084 diff --git a/Dataset/Audio_files/5012_80192_000020_000003.wav b/Dataset/Audio_files/5012_80192_000020_000003.wav new file mode 100644 index 0000000000000000000000000000000000000000..feadaad91006c8fe5bbd2633e95e1a1c842e825c --- /dev/null +++ b/Dataset/Audio_files/5012_80192_000020_000003.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91aa9243a6697d65e6f5464b40e9b420b5e5cdef83b64a5556baef1ac548f11e +size 409004 diff --git a/Dataset/Audio_files/5802_76044_000038_000000.wav b/Dataset/Audio_files/5802_76044_000038_000000.wav new file mode 100644 index 0000000000000000000000000000000000000000..eeff983a657840379c7308bb494a3c83d8e5ac09 --- /dev/null +++ b/Dataset/Audio_files/5802_76044_000038_000000.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22964325ee8f751dddd136b3219191443270529d95ee27b45c4a789501286492 +size 460844 diff --git a/Dataset/Audio_files/6544_71420_000024_000001.wav b/Dataset/Audio_files/6544_71420_000024_000001.wav new file mode 100644 index 0000000000000000000000000000000000000000..445645892dbaeedd2183bda1960299b0a06c04dd --- /dev/null +++ b/Dataset/Audio_files/6544_71420_000024_000001.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1289b6714acb263b8bb36d6acfbb4efded0a5c67cc9b6a6246340dd3493c6c2b +size 209324 diff --git a/Dataset/Audio_files/6918_47541_000006_000008.wav b/Dataset/Audio_files/6918_47541_000006_000008.wav new file mode 100644 index 0000000000000000000000000000000000000000..6b4d2bae883e73a78d4f1d565c7604e4f0a7aee9 --- /dev/null +++ b/Dataset/Audio_files/6918_47541_000006_000008.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dcd7955f5469755038482a58f0929012526f98130513acd9d0cd1bc208bbfe8 +size 898612 diff --git a/Dataset/Audio_files/7011_66622_000032_000002.wav b/Dataset/Audio_files/7011_66622_000032_000002.wav new file mode 100644 index 0000000000000000000000000000000000000000..c0b6fc90f49c7e80429361145d9eee72b9831a3c --- /dev/null +++ b/Dataset/Audio_files/7011_66622_000032_000002.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:111c88a591efa42a608d1609214e6ef56a64f3bd79a88b57efecba2ca2f7ed4c +size 309164 diff --git a/Dataset/Audio_files/7059_77897_000017_000001.wav b/Dataset/Audio_files/7059_77897_000017_000001.wav new file mode 100644 index 0000000000000000000000000000000000000000..aa7dbaa9ad91035afc72e51c6a9740bfd7a3317e --- /dev/null +++ b/Dataset/Audio_files/7059_77897_000017_000001.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17f51a3d2133e81607e36403b6bcb8bd7ec9e03c1bcfbbc80b4123c1b31d6618 +size 243884 diff --git a/Dataset/Audio_files/7190_90542_000054_000000.wav b/Dataset/Audio_files/7190_90542_000054_000000.wav new file mode 100644 index 0000000000000000000000000000000000000000..cd57aa42781ce78c33bd85d441752713331c8618 --- /dev/null +++ b/Dataset/Audio_files/7190_90542_000054_000000.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34bdcfbdf51f3475465804b9dbf27f8e647ccc1af17573b0a923f44881217093 +size 222764 diff --git a/Dataset/Audio_files/7226_86965_000020_000001.wav b/Dataset/Audio_files/7226_86965_000020_000001.wav new file mode 100644 index 0000000000000000000000000000000000000000..3ae60af52a50d25a871d54df5372c8047468b7d4 --- /dev/null +++ b/Dataset/Audio_files/7226_86965_000020_000001.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b824a36ceaeec45724088957b1e543ee3b477ca1ee55e4c55e96ac8c2b018fb5 +size 622132 diff --git a/Dataset/Audio_files/7245_104888_000016_000000.wav b/Dataset/Audio_files/7245_104888_000016_000000.wav new file mode 100644 index 0000000000000000000000000000000000000000..34777d6a44ac3d1cb7a68df6cf6f2b5c3964529f --- /dev/null +++ b/Dataset/Audio_files/7245_104888_000016_000000.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92d74fe5965fff3182cb1f273c80ea051033c7aa0dbbb44ae48ccded15210216 +size 341804 diff --git a/Dataset/Audio_files/83_9960_000017_000003.wav b/Dataset/Audio_files/83_9960_000017_000003.wav new file mode 100644 index 0000000000000000000000000000000000000000..9bf31f59261ea47beb21302d7ec7d7ad92a41d72 --- /dev/null +++ b/Dataset/Audio_files/83_9960_000017_000003.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a1cdc21d779c1d108af86ec6a93558a501322a67c221c25e2dd32d93e0c356a +size 192044 diff --git a/Dataset/Audio_files/8758_296465_000020_000000.wav b/Dataset/Audio_files/8758_296465_000020_000000.wav new file mode 100644 index 0000000000000000000000000000000000000000..1ab3f0ed6136e2c9922698ced2d7bce1dc8a4208 --- /dev/null +++ b/Dataset/Audio_files/8758_296465_000020_000000.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8adafe1ab7b3e86c82454c06863dd616c5b52f91ebb8690fcc64ec7abb2821dc +size 520364 diff --git a/Dataset/Audio_files/8820_294120_000011_000001.wav b/Dataset/Audio_files/8820_294120_000011_000001.wav new file mode 100644 index 0000000000000000000000000000000000000000..2b99764656e794b415e2783948151d69ee5234ad --- /dev/null +++ b/Dataset/Audio_files/8820_294120_000011_000001.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b78d6b2aed1a99e8f3750bd54c50e8ed2e08dba114792fa604101faf27894708 +size 213164 diff --git a/Dataset/Embeddings/1034/1034_121119_000028_000001.pth b/Dataset/Embeddings/1034/1034_121119_000028_000001.pth new file mode 100644 index 0000000000000000000000000000000000000000..7795dfb5787b650c6bba451c7054375ed0728cc3 --- /dev/null +++ b/Dataset/Embeddings/1034/1034_121119_000028_000001.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4a7bdd020bf0da6fb08d272448c8b61c6f065e529084ce1cf9c39c1636e017c +size 2358 diff --git a/Dataset/Embeddings/1088/1088_129236_000006_000007.pth b/Dataset/Embeddings/1088/1088_129236_000006_000007.pth new file mode 100644 index 0000000000000000000000000000000000000000..e75973ec870e0f63360af27cc1fff5c70d6e9c9b --- /dev/null +++ b/Dataset/Embeddings/1088/1088_129236_000006_000007.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1544023ea8afb9b0c71fa31e1e16d2ec510cf9d8637a64648941448c9e5e18ae +size 2358 diff --git a/Dataset/Embeddings/14/14_212_000019_000000.pth b/Dataset/Embeddings/14/14_212_000019_000000.pth new file mode 100644 index 0000000000000000000000000000000000000000..b93e910c25c06b7635960a28295a32bfb5e950c4 --- /dev/null +++ b/Dataset/Embeddings/14/14_212_000019_000000.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f248135ffdacc81ef4b5071f564448d49c2341b5c5c14bf4257af633f9318fd +size 2269 diff --git a/Dataset/Embeddings/1422/1422_149735_000006_000000.pth b/Dataset/Embeddings/1422/1422_149735_000006_000000.pth new file mode 100644 index 0000000000000000000000000000000000000000..2413583f5360843ebeadb63754b1e065e0b96574 --- /dev/null +++ b/Dataset/Embeddings/1422/1422_149735_000006_000000.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fee0b79857cf8ce499a658dfeb5137d5b4fa7e849dd8118c32028391b88b3d08 +size 2358 diff --git a/Dataset/Embeddings/1535/1535_141644_000004_000001.pth b/Dataset/Embeddings/1535/1535_141644_000004_000001.pth new file mode 100644 index 0000000000000000000000000000000000000000..23a9496bc51c51c672a3a99d69485d507290447b --- /dev/null +++ b/Dataset/Embeddings/1535/1535_141644_000004_000001.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d71a10862f81b5a3f0877f6eb26cd4bc733e0cc9868acc65a65bb23ffe304b9 +size 2358 diff --git a/Dataset/Embeddings/1731/1731_142320_000122_000005.pth b/Dataset/Embeddings/1731/1731_142320_000122_000005.pth new file mode 100644 index 0000000000000000000000000000000000000000..77049b81d396b2f4cf6254b90b16591df920d5eb --- /dev/null +++ b/Dataset/Embeddings/1731/1731_142320_000122_000005.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79f2de5cb40487fe19b5099b57fe0a41f0436f554019c619bcb4cd9d6c64bf36 +size 2358 diff --git a/Dataset/Embeddings/3009/3009_10327_000027_000005.pth b/Dataset/Embeddings/3009/3009_10327_000027_000005.pth new file mode 100644 index 0000000000000000000000000000000000000000..39d608dcb78198dff105ad64c41e680593b170d8 --- /dev/null +++ b/Dataset/Embeddings/3009/3009_10327_000027_000005.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f24187ad06ecbe02df165538c6881192cfd055b5a3cc5ab1348d2c05d6567421 +size 2353 diff --git a/Dataset/Embeddings/329/329_861_000024_000003.pth b/Dataset/Embeddings/329/329_861_000024_000003.pth new file mode 100644 index 0000000000000000000000000000000000000000..9f37d896c6a6a19991d35a61645640f0334de7fb --- /dev/null +++ b/Dataset/Embeddings/329/329_861_000024_000003.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc001f48b3f67d25192967e0a297dc1787144e36222e5b83a71ae6f5b89be9b3 +size 2274 diff --git a/Dataset/Embeddings/4830/4830_25904_000008_000001.pth b/Dataset/Embeddings/4830/4830_25904_000008_000001.pth new file mode 100644 index 0000000000000000000000000000000000000000..f98510c0837e3059333194f2f26a8429a5f7b767 --- /dev/null +++ b/Dataset/Embeddings/4830/4830_25904_000008_000001.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:170bdd680d0735a19c5e88e01fc8bf84dac623d7c73eebeff6e99974b8e9d081 +size 2353 diff --git a/Dataset/Embeddings/4957/4957_30119_000070_000001.pth b/Dataset/Embeddings/4957/4957_30119_000070_000001.pth new file mode 100644 index 0000000000000000000000000000000000000000..32cce89dacb7e3816a1a4155139d21a773586dab --- /dev/null +++ b/Dataset/Embeddings/4957/4957_30119_000070_000001.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c922316a446bcc28db8a43f768ade2b2113ce0f6fab24b60b396f67264ce07c8 +size 2353 diff --git a/Dataset/Embeddings/5012/5012_80192_000020_000003.pth b/Dataset/Embeddings/5012/5012_80192_000020_000003.pth new file mode 100644 index 0000000000000000000000000000000000000000..abef6da452c3919155b1e43ae02cf2a2a6f293a3 --- /dev/null +++ b/Dataset/Embeddings/5012/5012_80192_000020_000003.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba1c17f5100b1e0147e9c96d864cc054e8840a15cd46307e191fbe88a728b1b0 +size 2353 diff --git a/Dataset/Embeddings/5802/5802_76044_000038_000000.pth b/Dataset/Embeddings/5802/5802_76044_000038_000000.pth new file mode 100644 index 0000000000000000000000000000000000000000..754af078eecea4c8cf77c7cd3383dd25a0b46ae1 --- /dev/null +++ b/Dataset/Embeddings/5802/5802_76044_000038_000000.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7656515f537fa6de193f40d78c9747cfb1268266d3dd88a22a41ce2c3a28514a +size 2353 diff --git a/Dataset/Embeddings/6544/6544_71420_000024_000001.pth b/Dataset/Embeddings/6544/6544_71420_000024_000001.pth new file mode 100644 index 0000000000000000000000000000000000000000..a5fc18d0832f940a61928bf7de9203bcbe65a762 --- /dev/null +++ b/Dataset/Embeddings/6544/6544_71420_000024_000001.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f70c9bd92dea6ddfd495c7ab32cae30494eaf3b42f6d6533ff9f55de80593f05 +size 2353 diff --git a/Dataset/Embeddings/6918/6918_47541_000006_000008.pth b/Dataset/Embeddings/6918/6918_47541_000006_000008.pth new file mode 100644 index 0000000000000000000000000000000000000000..1c589adf65cdc87082d0aeef1a71f394102321fb --- /dev/null +++ b/Dataset/Embeddings/6918/6918_47541_000006_000008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed49a76c226606f98ce4c2db2aac937354e40cc8fb789e29e93aa87f64bc01d1 +size 2353 diff --git a/Dataset/Embeddings/7011/7011_66622_000032_000002.pth b/Dataset/Embeddings/7011/7011_66622_000032_000002.pth new file mode 100644 index 0000000000000000000000000000000000000000..1321781bee6ce4113d1622eddbde0fdb48a762d2 --- /dev/null +++ b/Dataset/Embeddings/7011/7011_66622_000032_000002.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43e63641af7d4322b89489acb9c10cfc7e71961bd6479c55c17135b3ecfa5605 +size 2353 diff --git a/Dataset/Embeddings/7059/7059_77897_000017_000001.pth b/Dataset/Embeddings/7059/7059_77897_000017_000001.pth new file mode 100644 index 0000000000000000000000000000000000000000..0ba1d2ba5edc31a587b5f06f26ea4240520fa7ad --- /dev/null +++ b/Dataset/Embeddings/7059/7059_77897_000017_000001.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:688e50692694cccbe5f61c8780e0980509118f4061a44180ec8dffff2d963921 +size 2353 diff --git a/Dataset/Embeddings/7190/7190_90542_000054_000000.pth b/Dataset/Embeddings/7190/7190_90542_000054_000000.pth new file mode 100644 index 0000000000000000000000000000000000000000..af82be283e6c9099f3c977aea25f51a1750b09a6 --- /dev/null +++ b/Dataset/Embeddings/7190/7190_90542_000054_000000.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f1925fcb8ce5ffa8b9223de17ea8d98c0abb24409852208f03c607374c9f60a +size 2353 diff --git a/Dataset/Embeddings/7226/7226_86965_000020_000001.pth b/Dataset/Embeddings/7226/7226_86965_000020_000001.pth new file mode 100644 index 0000000000000000000000000000000000000000..7a06fc4b5c2a4dc16b43519691dd6e7eeb17063c --- /dev/null +++ b/Dataset/Embeddings/7226/7226_86965_000020_000001.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f88a82eee39139ab65f3b201f2657b795ad66d70ccd637f903d537df2acaca0 +size 2353 diff --git a/Dataset/Embeddings/7245/7245_104888_000016_000000.pth b/Dataset/Embeddings/7245/7245_104888_000016_000000.pth new file mode 100644 index 0000000000000000000000000000000000000000..8ad02e597904100f2b98182e08c0de1e7e286859 --- /dev/null +++ b/Dataset/Embeddings/7245/7245_104888_000016_000000.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db50270eb45aab4344720a1da44d3c9d91ace10e69514287b3174ba9c2ca208a +size 2358 diff --git a/Dataset/Embeddings/83/83_9960_000017_000003.pth b/Dataset/Embeddings/83/83_9960_000017_000003.pth new file mode 100644 index 0000000000000000000000000000000000000000..65db33e3380a81f44a80a697b4f21d477b757b04 --- /dev/null +++ b/Dataset/Embeddings/83/83_9960_000017_000003.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:958832b7e4e77f6eb8343b91091c8603b683b25c03f242e6de4b09952a0fba6d +size 2274 diff --git a/Dataset/Embeddings/8758/8758_296465_000020_000000.pth b/Dataset/Embeddings/8758/8758_296465_000020_000000.pth new file mode 100644 index 0000000000000000000000000000000000000000..a12bb487870216418045b17cc16623a85c58b9ae --- /dev/null +++ b/Dataset/Embeddings/8758/8758_296465_000020_000000.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:465be53ae1d0a44ccdb90e0fcaccf09a0ae91041f984ef18f606df0169ea8f3e +size 2358 diff --git a/Dataset/Embeddings/8820/8820_294120_000011_000001.pth b/Dataset/Embeddings/8820/8820_294120_000011_000001.pth new file mode 100644 index 0000000000000000000000000000000000000000..3759fbedcd3016c34a71c2f4953d8f4357234199 --- /dev/null +++ b/Dataset/Embeddings/8820/8820_294120_000011_000001.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b79ee0d4796df0776bc0ddfc8683f2a025c4829893f28b3cff6b4a2d5405d968 +size 2358 diff --git a/Dataset/Embeddings/mean.json b/Dataset/Embeddings/mean.json new file mode 100644 index 0000000000000000000000000000000000000000..a05338e3ed38749e7b317f90cb899ee2a81710ee --- /dev/null +++ b/Dataset/Embeddings/mean.json @@ -0,0 +1,258 @@ +[ + 0.21412190794944763, + 0.18206638097763062, + 0.11840786784887314, + 0.09126990288496017, + 0.04086871072649956, + -0.149668350815773, + 0.2645065188407898, + 0.27953410148620605, + 0.6700411438941956, + -0.06264923512935638, + 0.2915269732475281, + 0.12102372199296951, + -0.5578641891479492, + -0.12462181597948074, + 0.6190101504325867, + -0.5761605501174927, + -0.084229975938797, + -0.0006869725184515119, + 0.49899742007255554, + -0.21737882494926453, + -0.05707789212465286, + -0.18819154798984528, + -0.5531325340270996, + 0.22641371190547943, + 0.07952054589986801, + 0.09851367026567459, + 0.03574512526392937, + -0.13013364374637604, + -0.35363155603408813, + 0.49086689949035645, + 0.08895495533943176, + 0.36905843019485474, + -0.10707297921180725, + -0.11953406780958176, + 0.043051160871982574, + 0.09323996305465698, + -0.16280269622802734, + -0.13945965468883514, + 0.2095673531293869, + 0.09729334712028503, + 0.040950167924165726, + -0.37764972448349, + -0.018613651394844055, + -0.581308901309967, + -0.4080854058265686, + -0.42118221521377563, + 1.0161728858947754, + -0.19709929823875427, + -0.024254681542515755, + 0.04121233895421028, + -0.15502692759037018, + 0.7614311575889587, + -0.6833258271217346, + 0.33979618549346924, + 0.49055442214012146, + 0.011953921988606453, + 0.4490082263946533, + 0.2667522728443146, + -0.6408993005752563, + -0.17682728171348572, + 0.12336420267820358, + 0.1474267542362213, + -0.11565382778644562, + 0.6467825174331665, + 0.10751526057720184, + -0.14141449332237244, + 0.6352338194847107, + -0.04154682531952858, + 0.12760530412197113, + -0.6243913769721985, + 0.08836925774812698, + 0.28105032444000244, + -0.15209053456783295, + -0.0037005548365414143, + 0.3098902106285095, + 0.150644913315773, + 0.07396118342876434, + -0.049714382737874985, + -0.5445783138275146, + -0.033714842051267624, + 0.1200188472867012, + -0.2312866747379303, + 0.20238173007965088, + -0.5392364263534546, + -0.40682801604270935, + -0.16234233975410461, + -0.6470288634300232, + -0.1738162636756897, + 0.25936004519462585, + -0.15742169320583344, + 0.24468930065631866, + 0.13714095950126648, + 0.1449803113937378, + 0.16882915794849396, + 0.19944046437740326, + -0.29332247376441956, + 0.0026240404695272446, + 0.03341501206159592, + 0.01569036766886711, + -0.4688950777053833, + 0.09352052956819534, + 0.13269393146038055, + 0.06116529926657677, + -0.06562789529561996, + -0.23961076140403748, + -0.22402845323085785, + 0.47103151679039, + 0.0728374496102333, + -0.561316192150116, + 0.46127453446388245, + 0.15431830286979675, + 0.08550310134887695, + -0.03363621234893799, + 0.04015417397022247, + -0.014262784272432327, + 0.08499719202518463, + -0.39322608709335327, + 0.27674373984336853, + 0.24571490287780762, + -0.2642858326435089, + -0.7408877015113831, + 0.21007885038852692, + 0.5898057222366333, + 0.14988923072814941, + -0.07782910019159317, + 0.4078785479068756, + 0.3004123270511627, + 0.6256987452507019, + -0.21651767194271088, + -0.17712117731571198, + -0.2749980688095093, + 0.4826784133911133, + 0.3035520911216736, + 0.23235619068145752, + -0.061135340481996536, + 0.49035653471946716, + -0.16356635093688965, + -0.35920438170433044, + 0.023298246785998344, + 0.015880409628152847, + -0.015357445925474167, + -0.3540240228176117, + 0.44811102747917175, + -0.05202110856771469, + -0.19488674402236938, + 0.4875786602497101, + -0.03857485204935074, + 0.463600754737854, + -0.07009128481149673, + 0.29871219396591187, + -0.35601672530174255, + 0.5102726817131042, + 0.3902379274368286, + 0.3692609369754791, + -0.35389819741249084, + 0.07650414854288101, + -0.63330078125, + 0.5580229759216309, + 0.10672216862440109, + 0.10609150677919388, + 0.45468848943710327, + 0.15291742980480194, + 0.36706316471099854, + -0.2831500768661499, + -0.14291781187057495, + -0.17804013192653656, + -0.5424429178237915, + -0.15468499064445496, + 0.07343851029872894, + 0.5380398631095886, + 0.44494226574897766, + 0.9300274848937988, + -0.0274032074958086, + 0.3488404154777527, + -0.23694315552711487, + -0.2424279898405075, + -0.04125871881842613, + 0.06136211380362511, + -0.5118930339813232, + -0.15055209398269653, + 0.45361533761024475, + 0.12657225131988525, + 0.34210655093193054, + 0.313772052526474, + -0.3521589934825897, + 0.05892332270741463, + -0.11534406244754791, + 0.514985203742981, + 0.054903097450733185, + 0.18034562468528748, + 0.26060545444488525, + -0.29317837953567505, + 0.1423174887895584, + 0.25360995531082153, + -0.47162681818008423, + 0.5438259243965149, + 0.02562086470425129, + 0.020302919670939445, + 0.3039097189903259, + 0.19996808469295502, + 0.3423006236553192, + 0.4524010717868805, + -0.3152591586112976, + -0.60369873046875, + 0.16421166062355042, + -0.055804263800382614, + -0.35883089900016785, + 0.32918551564216614, + -0.4741072952747345, + 0.05971089377999306, + -0.062083590775728226, + 0.05729498714208603, + -0.6715519428253174, + 0.2646842896938324, + 0.14343565702438354, + 0.2957288324832916, + 0.37478363513946533, + -0.684753954410553, + -0.14382798969745636, + -0.3416562080383301, + 0.6120049953460693, + 0.24825794994831085, + 0.049689218401908875, + 0.08789665251970291, + -0.518900454044342, + -0.2226269692182541, + 0.17690403759479523, + 0.011226996779441833, + 0.05879935249686241, + 0.03022083267569542, + 0.11887083947658539, + 0.7854664325714111, + -0.2452417016029358, + 0.6136188507080078, + 0.5491909384727478, + -0.07412725687026978, + -0.3089025616645813, + 0.16618099808692932, + -0.03215228021144867, + 0.13637210428714752, + 0.10921650379896164, + -0.14989499747753143, + 0.6000584959983826, + 0.19014132022857666, + -0.007800411432981491, + -0.06849341839551926, + -0.19043166935443878, + -0.012874589301645756, + -0.8398106694221497, + -0.002614892553538084, + -0.26642924547195435, + 0.25869783759117126, + -0.46403658390045166, + 0.18120701611042023, + 0.08567068725824356, + 0.08117248862981796 +] \ No newline at end of file diff --git a/Dataset/Embeddings/std.json b/Dataset/Embeddings/std.json new file mode 100644 index 0000000000000000000000000000000000000000..8b628e81d92e4358e9f4d9959e7e821876b8dcac --- /dev/null +++ b/Dataset/Embeddings/std.json @@ -0,0 +1,258 @@ +[ + 0.8075656890869141, + 0.8826062679290771, + 0.8430591821670532, + 0.8703321814537048, + 0.877600371837616, + 0.8111068606376648, + 0.8719013929367065, + 0.9000007510185242, + 0.9740477800369263, + 0.8267052173614502, + 0.8011612296104431, + 0.9747788906097412, + 0.8026949763298035, + 0.8818342089653015, + 0.8605656623840332, + 0.8279756903648376, + 0.772606611251831, + 0.8957112431526184, + 0.8716765642166138, + 0.7797929644584656, + 0.8252673149108887, + 0.781441330909729, + 0.8043056130409241, + 0.877123236656189, + 0.9237406849861145, + 0.7914682030677795, + 0.9089431166648865, + 0.8154596090316772, + 0.8381725549697876, + 0.8573335409164429, + 0.7951206564903259, + 0.8356125354766846, + 0.8639358282089233, + 0.8588302135467529, + 0.8966045379638672, + 0.836276113986969, + 0.8558772206306458, + 0.8904256820678711, + 0.8009889721870422, + 0.9030625820159912, + 0.8489034175872803, + 0.7720499038696289, + 0.780423641204834, + 0.7854387760162354, + 0.8878417611122131, + 0.8503796458244324, + 0.8932433128356934, + 0.9315906763076782, + 0.8437496423721313, + 0.8389645218849182, + 0.8701387643814087, + 0.9080750942230225, + 1.0714792013168335, + 0.8976108431816101, + 0.8437362909317017, + 0.8633260726928711, + 0.8580045700073242, + 0.8063361644744873, + 0.8105617761611938, + 0.8995920419692993, + 0.8316185474395752, + 0.9079830050468445, + 0.8115889430046082, + 0.8792805671691895, + 0.8858475685119629, + 0.7682526111602783, + 0.8312106728553772, + 0.8296751379966736, + 0.9122119545936584, + 0.9119444489479065, + 0.8761489391326904, + 0.8376705646514893, + 0.9226043820381165, + 0.8830709457397461, + 0.819685161113739, + 0.9397792816162109, + 0.833674967288971, + 0.8619604110717773, + 0.8484258651733398, + 0.943915605545044, + 0.8020740151405334, + 0.8027610182762146, + 0.9116966724395752, + 0.8570717573165894, + 0.7944185733795166, + 0.8977150917053223, + 0.9434093236923218, + 0.9964787364006042, + 0.8149264454841614, + 0.8179062604904175, + 0.832256555557251, + 0.866649329662323, + 0.8442603349685669, + 0.9397143125534058, + 0.8501031398773193, + 0.9365203380584717, + 0.8380716443061829, + 0.8887302279472351, + 0.8084500432014465, + 0.7769243121147156, + 0.8449881076812744, + 0.9015783667564392, + 0.9295680522918701, + 0.8259174227714539, + 0.8573725819587708, + 0.8600193858146667, + 0.8780449032783508, + 0.8595342040061951, + 0.7720226049423218, + 0.816754937171936, + 0.8180097937583923, + 0.8093970417976379, + 0.9032255411148071, + 0.8697183728218079, + 0.888511061668396, + 0.7960647940635681, + 0.8589795827865601, + 0.8813145160675049, + 0.8638142347335815, + 0.9093354344367981, + 0.8201130628585815, + 0.8607465028762817, + 0.9925655722618103, + 0.9680612683296204, + 0.8303309679031372, + 0.8515812158584595, + 0.8854086399078369, + 0.8599415421485901, + 0.8196620941162109, + 0.9137897491455078, + 0.8218133449554443, + 0.8703830242156982, + 0.845089852809906, + 0.8652607202529907, + 0.877587080001831, + 0.834847629070282, + 0.7999405860900879, + 0.867475152015686, + 0.9779040217399597, + 0.8888542652130127, + 0.8318555951118469, + 0.8721846342086792, + 0.8582359552383423, + 0.8781721591949463, + 0.7750568389892578, + 0.9456684589385986, + 0.8390375971794128, + 0.8528217077255249, + 0.9676473736763, + 0.9669485092163086, + 0.8177183866500854, + 0.8109471201896667, + 0.8565740585327148, + 1.012668490409851, + 0.8075276017189026, + 0.8120420575141907, + 0.8192445039749146, + 0.9088258743286133, + 0.806582510471344, + 0.8778362274169922, + 0.9832965135574341, + 0.8517345190048218, + 0.8954508900642395, + 0.8626090288162231, + 0.8306634426116943, + 0.7902420163154602, + 0.8680355548858643, + 0.8405691385269165, + 0.8080191612243652, + 0.8716298937797546, + 0.8520878553390503, + 0.8133600354194641, + 0.9267045855522156, + 0.8689888715744019, + 0.8166713118553162, + 0.8387840390205383, + 0.835797131061554, + 0.8922353386878967, + 0.8736470937728882, + 0.9051007032394409, + 0.8347994685173035, + 0.8269197344779968, + 0.7968848943710327, + 0.8677981495857239, + 0.8539698719978333, + 0.9122839570045471, + 0.907562255859375, + 0.908149242401123, + 0.8897758722305298, + 0.8776298761367798, + 0.8702916502952576, + 0.7712435722351074, + 0.8737289905548096, + 1.003007411956787, + 0.9195813536643982, + 0.9373644590377808, + 0.8549340963363647, + 0.8885018229484558, + 0.8555989265441895, + 0.8315033316612244, + 0.8457157611846924, + 0.8452540636062622, + 0.9597710967063904, + 0.8279005885124207, + 0.9954813122749329, + 0.8817158937454224, + 0.8564739227294922, + 0.8737724423408508, + 0.8833761215209961, + 0.9069574475288391, + 0.8549059629440308, + 0.8478658199310303, + 0.8306840062141418, + 0.8308926820755005, + 0.8582388162612915, + 0.7912089228630066, + 0.843919038772583, + 0.8585576415061951, + 0.850679337978363, + 0.921983003616333, + 0.8164607882499695, + 0.8369028568267822, + 0.7947129607200623, + 0.8371235132217407, + 0.8269281387329102, + 0.8633431196212769, + 0.9147580862045288, + 0.9019842743873596, + 0.8293289542198181, + 0.8421900868415833, + 0.8144598603248596, + 0.9013247489929199, + 0.7653704285621643, + 0.8295224905014038, + 0.9549149870872498, + 0.8671613931655884, + 0.8507492542266846, + 0.8559182286262512, + 0.839141309261322, + 0.918213427066803, + 0.9064037203788757, + 0.8579128980636597, + 0.8337833881378174, + 0.9374175071716309, + 0.9142330884933472, + 0.7878691554069519, + 0.8651018142700195, + 0.8595719933509827, + 0.8955603837966919, + 0.9085484743118286, + 0.8001472353935242, + 0.7812052369117737, + 0.8475046157836914, + 0.8226194381713867, + 0.8940064311027527, + 0.9277697801589966 +] \ No newline at end of file diff --git a/Dataset/dataset.yaml b/Dataset/dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0715c87a69570f45eb66a231b12d8f9a39865247 --- /dev/null +++ b/Dataset/dataset.yaml @@ -0,0 +1,67 @@ +dataset: + '7190_90542_000054_000000': + speaker_id: '7190' + example_id: '7190_90542_000054_000000' + '4830_25904_000008_000001': + speaker_id: '4830' + example_id: '4830_25904_000008_000001' + '8820_294120_000011_000001': + speaker_id: '8820' + example_id: '8820_294120_000011_000001' + '3009_10327_000027_000005': + speaker_id: '3009' + example_id: '3009_10327_000027_000005' + '7226_86965_000020_000001': + speaker_id: '7226' + example_id: '7226_86965_000020_000001' + '329_861_000024_000003': + speaker_id: '329' + example_id: '329_861_000024_000003' + '5802_76044_000038_000000': + speaker_id: '5802' + example_id: '5802_76044_000038_000000' + '1535_141644_000004_000001': + speaker_id: '1535' + example_id: '1535_141644_000004_000001' + '7011_66622_000032_000002': + speaker_id: '7011' + example_id: '7011_66622_000032_000002' + '8758_296465_000020_000000': + speaker_id: '8758' + example_id: '8758_296465_000020_000000' + '1034_121119_000028_000001': + speaker_id: '1034' + 'example_id': '1034_121119_000028_000001' + '4957_30119_000070_000001': + speaker_id: '4957' + example_id: '4957_30119_000070_000001' + '83_9960_000017_000003': + speaker_id: '83' + example_id: '83_9960_000017_000003' + '7059_77897_000017_000001': + speaker_id: '7059' + example_id: '7059_77897_000017_000001' + '1731_142320_000122_000005': + speaker_id: '1731' + example_id: '1731_142320_000122_000005' + '6918_47541_000006_000008': + speaker_id: '6918' + example_id: '6918_47541_000006_000008' + '6544_71420_000024_000001': + speaker_id: '6544' + example_id: '6544_71420_000024_000001' + '7245_104888_000016_000000': + speaker_id: '7245' + example_id: '7245_104888_000016_000000' + '5012_80192_000020_000003': + speaker_id: '5012' + example_id: '5012_80192_000020_000003' + '1422_149735_000006_000000': + speaker_id: '1422' + example_id: '1422_149735_000006_000000' + '14_212_000019_000000': + speaker_id: '14' + example_id: '14_212_000019_000000' + '1088_129236_000006_000007': + speaker_id: '1088' + example_id: '1088_129236_000006_000007' diff --git a/app.py b/app.py index 4ec513dcc365c02f07d1115d0a3e56900c0095e7..118d2d5b53a4bc00a513291ee5f31c6b6a5256aa 100644 --- a/app.py +++ b/app.py @@ -1,31 +1,39 @@ import numpy as np from pathlib import Path -import padertorch as pt import paderbox as pb -import time import torch -import torchaudio from onnxruntime import InferenceSession from pvq_manipulation.models.vits import Vits_NT from pvq_manipulation.models.ffjord import FFJORD -from IPython.display import display, Audio, clear_output from pvq_manipulation.models.hubert import HubertExtractor, SID_LARGE_LAYER import librosa from pvq_manipulation.helper.vad import EnergyVAD import gradio as gr +from pvq_manipulation.helper.creapy_wrapper import process_file -device = 'cpu' #'cuda' if torch.cuda.is_available() else 'cpu' +device = 'cuda' if torch.cuda.is_available() else 'cpu' +pvq_labels = ['Weight', 'Resonance', 'Breathiness', 'Roughness', 'Loudness', 'Strain', 'Pitch'] -# load tts model -storage_dir_tts = Path("./models/tts_model/") -tts_model = Vits_NT.load_model(storage_dir_tts, "model.pt") +dataset_dict = pb.io.load_yaml('./Dataset/dataset.yaml') + +cached_example_id = None +cached_loaded_example = None +cached_labels = None +cached_d_vector = None +cached_unmanipulated = None + +# path to stats +stats_path = Path('./Dataset/Embeddings/') # load normalizing flow storage_dir_normalizing_flow = Path("./models/norm_flow") -speaker_conditioning = pb.io.load(storage_dir_normalizing_flow / "speaker_conditioning.json") - +config_norm_flow = pb.io.load_yaml(storage_dir_normalizing_flow / "config.json") normalizing_flow = FFJORD.load_model(storage_dir_normalizing_flow, checkpoint="model.pt", device=device) +# load tts model +storage_dir_tts = Path("./models/tts_model/") +tts_model = Vits_NT.load_model(storage_dir_tts, "model.pt") + # load hubert features model hubert_model = HubertExtractor( layer=SID_LARGE_LAYER, @@ -35,140 +43,157 @@ hubert_model = HubertExtractor( # storage_dir= # target storage dir hubert model ) -# example synthesis -# speaker_id = 1034 -# example_id = "1034_121119_000028_000001" - -# wav_1 = tts_model.synthesize_from_example({ -# 'text' : "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", -# 'd_vector_storage_root': f"./Saved_models/Dataset/Embeddings/{speaker_id}/{example_id}.pth" -# }) -# display(Audio(wav_1, rate=24_000, normalize=True)) -# manipulation block def get_manipulation( - d_vector, + example, labels, - flow, + flow, tts_model, + d_vector, + config_norm_flow, manipulation_idx=0, manipulation_fkt=1, ): labels_manipulated = labels.clone() - labels_manipulated[:,manipulation_idx] += manipulation_fkt - - output_forward = flow.forward((d_vector.float(), labels))[0] + labels_manipulated[:, manipulation_idx] += manipulation_fkt + + if config_norm_flow['flag_remove_mean']: + global_mean = pb.io.load(stats_path / "mean.json") + global_mean = torch.tensor(global_mean, dtype=torch.float32) + speaker_embedding_norm = (d_vector - global_mean) + global_std = pb.io.load(stats_path / "std.json") + global_std = torch.tensor(global_std, dtype=torch.float32) + speaker_embedding_norm = speaker_embedding_norm / global_std + else: + speaker_embedding_norm = d_vector + + output_forward = flow.forward((speaker_embedding_norm.float(), labels))[0] sampled_class_manipulated = flow.sample((output_forward, labels_manipulated))[0] + if config_norm_flow['flag_remove_mean']: + sampled_class_manipulated = (sampled_class_manipulated * global_std + global_mean) + wav = tts_model.synthesize_from_example({ - 'text': "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", + 'text': example['transcription'], 'd_vector': d_vector.detach().numpy(), 'd_vector_man': sampled_class_manipulated.detach().numpy(), - }) + 'd_vector_storage_root': example['d_vector_storage_root'], + }) return wav -def extract_speaker_embedding(example): - observation, sr = pb.io.load_audio(example['audio_path']['observation'], return_sample_rate=True) - observation = librosa.resample(observation, orig_sr=sr, target_sr=16_000) - - vad = EnergyVAD(sample_rate=16_000) - if observation.ndim == 1: - observation = observation[None, :] - - observation = vad({'audio_data': observation})['audio_data'] - - with torch.no_grad(): - example = tts_model.speaker_manager.prepare_example({'audio_data': {'observation': observation}, **example}) - example = pt.data.utils.collate_fn([example]) - example['features'] = torch.tensor(np.array(example['features'])) - d_vector = tts_model.speaker_manager.forward(example)[0] - return d_vector - -# load speaker labels -def load_speaker_labels(example, speaker_conditioning, reg_stor_dir=Path('./models/pvq_extractor/')): - audio, _ = torchaudio.load(example['audio_path']['observation']) - audio = audio.to(device) - num_samples = torch.tensor([audio.shape[-1]], device=device) +def get_creak_label(example): + audio_data = example['loaded_audio_data']['16_000'] + test, y_pred, included_indices = process_file(audio_data) + mean_creak = np.mean(y_pred[included_indices]) + return mean_creak * 100 + + +def load_speaker_labels(example, reg_stor_dir=Path('./models/pvq_extractor/')): + audio_data = torch.tensor(example['loaded_audio_data']['16_000'], dtype=torch.float)[None, :] + num_samples = torch.tensor([audio_data.shape[-1]]) + + if torch.cuda.is_available(): + audio_data = audio_data.cuda() + num_samples = num_samples.cuda() providers = ["CPUExecutionProvider"] with torch.no_grad(): features, seq_len = hubert_model( - audio, - 24_000, + audio_data, + 16_000, sequence_lengths=num_samples, ) features = np.mean(features.squeeze(0).detach().cpu().numpy(), axis=-1) - pvqd_predictions = {} - for pvq in ['Breathiness', 'Loudness', 'Pitch', 'Resonance', 'Roughness', 'Strain', 'Weight']: + for pvq in pvq_labels: with open(reg_stor_dir / f"{pvq}.onnx", "rb") as fid: onnx = fid.read() sess = InferenceSession(onnx, providers=providers) pred = sess.run(None, {"X": features[None]})[0].squeeze(1) pvqd_predictions[pvq] = pred.tolist()[0] - labels = [] - for key in speaker_conditioning: - labels.append(pvqd_predictions[key]/100) - return torch.tensor(labels) + pvqd_predictions['Creak_mean'] = get_creak_label(example) + labels = [pvqd_predictions[key] / 100 for key in pvq_labels + ["Creak_mean"]] + return torch.tensor(labels, device=device).float() + + +def load_audio_files(example): + observation_loaded, sr = pb.io.load_audio(example['audio_path']['observation'], return_sample_rate=True) + + example['loaded_audio_data'] = {} + observation = librosa.resample(observation_loaded, orig_sr=sr, target_sr=16_000) + + vad = EnergyVAD(sample_rate=16_000) + if observation.ndim == 1: + observation = observation[None, :] + + observation = vad({'audio_data': observation})['audio_data'] + example['loaded_audio_data']['16_000'] = observation + + observation = librosa.resample(observation, orig_sr=sr, target_sr=24_000) + vad = EnergyVAD(sample_rate=24_000) + if observation.ndim == 1: + observation = observation[None, :] + observation = vad({'audio_data': observation})['audio_data'] + example['loaded_audio_data']['24_000'] = observation + return example -example = { - 'audio_path': {'observation': "audio/1034_121119_000028_000001.wav"}, - 'speaker_id': 1034, - 'example_id': "1034_121119_000028_000001", -} -labels = load_speaker_labels(example, speaker_conditioning) -label_options = ['Weight', 'Resonance', 'Breathiness', 'Roughness', 'Loudness', 'Strain', 'Pitch'] +def update_manipulation(manipulation_idx, example_id, transcription, manipulation_fkt): + global cached_example_id, cached_loaded_example, cached_labels, cached_d_vector, example_database, cached_unmanipulated -# print('Estimated PVQ strengths of input speaker:') -# max_len = max(len(name) for name in label_options) -# for label_name, pvq in zip(label_options, labels): - # print(f'{label_name:<{max_len}} : {pvq:6.2f}') + speaker_id = dataset_dict['dataset'][example_id]['speaker_id'] + example = { + 'audio_path': {'observation': f"./Dataset/Audio_files/{example_id}.wav"}, + 'd_vector_storage_root': f"./Saved_models/Dataset/Embeddings/{speaker_id}/{example_id}.pth", + 'speaker_id': speaker_id, + 'example_id': example_id, + 'transcription': transcription + } -def update_manipulation(manipulation_idx, manipulation_fkt): + if cached_example_id != example_id: + cached_loaded_example = load_audio_files(example) + cached_d_vector = torch.load(f"./Dataset/Embeddings/{speaker_id}/{example_id}.pth") + cached_labels = load_speaker_labels(example) + cached_example_id = example_id + cached_unmanipulated = tts_model.synthesize_from_example({ + 'text': transcription, + 'd_vector': cached_d_vector.detach().numpy(), + }) - d_vector = extract_speaker_embedding(example) - labels = load_speaker_labels(example, speaker_conditioning) - wav_manipulated = get_manipulation( - # example=example, - d_vector=d_vector, - labels=labels[None, :], + example=example, + d_vector=cached_d_vector, + labels=cached_labels[None, :], flow=normalizing_flow, tts_model=tts_model, manipulation_idx=manipulation_idx, manipulation_fkt=manipulation_fkt, + config_norm_flow=config_norm_flow, ) - - wav_unmanipulated = tts_model.synthesize_from_example({ - 'text': "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - 'd_vector': d_vector.detach().numpy(), - }) - sr = 24_000 - return (sr, wav_unmanipulated), (sr, wav_manipulated) - - # with audio_output: - # clear_output(wait=True) - # print('Manipulated Speaker') - # display(Audio(wav_manipulated, rate=24_000, normalize=True)) - # print('Unmanipulated Synthese') - # display(Audio(wav_unmanipulated, rate=24_000, normalize=True)) - # print('Original Speaker') - # display(Audio(example['audio_path']['observation'], rate=24_000, normalize=True)) - - # print(f"Manipulated {label_options[manipulation_idx]} with strength {manipulation_fkt}") - - -dropdown_options = [(label, i) for i, label in enumerate(label_options)] + return (24_000, cached_unmanipulated), (24_000, wav_manipulated) + + demo = gr.Interface( title="Perceptual Voice Quality (PVQ) Manipulation", fn=update_manipulation, inputs=[ - gr.Dropdown(label="PVQ Feature", choices=dropdown_options, value=2, type="index"), - gr.Slider(label="Manipulation Factor", minimum=-2.0, maximum=2.0, value=1.0, step=0.1), + gr.Dropdown( + label="PVQ Feature", + choices=[('Weight', 0), ('Resonance', 1), ('Breathiness', 2), ('Roughness', 3), ('Creak', 7)], + value=2, type="value" + ), + gr.Dropdown( + choices=dataset_dict['dataset'].keys(), + value='1422_149735_000006_000000', type="value" + ), + gr.Textbox( + value="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", + placeholder='Type something' + ), + gr.Slider(label="Manipulation Factor", minimum=-1.0, maximum=2.0, value=1.0, step=0.1), ], outputs=[gr.Audio(label="original utterance"), gr.Audio(label="manipulated utterance")], ) diff --git a/models/norm_flow/config.json b/models/norm_flow/config.json index 520ef09629eaf9c4ade198b681bb55afbb82ee49..b890fe179bd19694d0546235893e512f9aadb487 100644 --- a/models/norm_flow/config.json +++ b/models/norm_flow/config.json @@ -1,12 +1,15 @@ { - "factory": "pvq_manipulation.models.ffjord.FFJORD", - "normalize": true, - "ode_function": { - "condition_dim": 7, - "factory": "pvq_manipulation.models.ode_functions.CNFNN", - "hidden_channels": [ - 512 - ], - "input_dim": 256 - } -} \ No newline at end of file + "model":{ + "factory": "pvq_manipulation.models.ffjord.FFJORD", + "normalize": true, + "ode_function": { + "condition_dim": 8, + "factory": "pvq_manipulation.models.ode_functions.CNFNN", + "hidden_channels": [ + 512 + ], + "input_dim": 256 + } + }, + "flag_remove_mean": true +} diff --git a/models/norm_flow/model.pt b/models/norm_flow/model.pt index 2d35cb25436a89869453af29dffda8237ce41d14..c86dabaf09245fb02893a473565bea229ee96584 100644 --- a/models/norm_flow/model.pt +++ b/models/norm_flow/model.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c5d5f64d3413a684254fdbdcc9c5d4c2c311946ff4c6724708ea4a8d332783d -size 1120055 +oid sha256:d1110d37007980fcd8f28525ff39632e6fbade8160891fc079dbeb8777bb4d49 +size 1125666 diff --git a/models/norm_flow/speaker_conditioning.json b/models/norm_flow/speaker_conditioning.json deleted file mode 100644 index dfec37f9a080571d00f925c3c32166296f48bbdb..0000000000000000000000000000000000000000 --- a/models/norm_flow/speaker_conditioning.json +++ /dev/null @@ -1,9 +0,0 @@ -[ - "Weight", - "Resonance", - "Breathiness", - "Roughness", - "Loudness", - "Strain", - "Pitch" -] \ No newline at end of file diff --git a/old_README.md b/old_README.md deleted file mode 100644 index f87977c9a91fd245a7ee64983d249869ae7f03e3..0000000000000000000000000000000000000000 --- a/old_README.md +++ /dev/null @@ -1,30 +0,0 @@ -# PVQ Manipulation - -This repository contains code for manipulating perceptual voice quality (PVQ) features, intended for experiments and synthesis using models such as YourTTS. - ---- - -## Installation - -### Clone the repository and install - -```sh -git clone https://github.com/FrederikRautenberg/pvq_manipulation.git -cd pvq_manipulation -pip install -e . -``` - -### Install [YourTTS](https://github.com/coqui-ai/TTS) from -``` -git clone https://github.com/coqui-ai/TTS -cd TTS -pip install -e .[all,dev,notebooks] # Select the relevant extras -``` -### Make sure that [Paderbox](https://github.com/fgnt/paderbox) and [Padertorch](https://github.com/fgnt/padertorch?tab=readme-ov-file) are installed from -``` -git clone https://github.com/fgnt/paderbox.git -cd paderbox -pip install --editable .[all] -git clone https://github.com/fgnt/padertorch.git -cd padertorch && pip install -e .[all] -``` diff --git a/pvq_manipulation/helper/creapy_wrapper.py b/pvq_manipulation/helper/creapy_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..c3174ac64b164bccd343d715ff92bc87966f07a1 --- /dev/null +++ b/pvq_manipulation/helper/creapy_wrapper.py @@ -0,0 +1,236 @@ +from __future__ import annotations + +import parselmouth as pm +import warnings +import numpy as np +import pandas as pd +from scipy.signal.windows import hann +from pathlib import Path +from sklearn.impute import SimpleImputer + +import creapy +from creapy.feature_extraction.feature_extraction import _cpp, _h1_h2, _jitter, _shimmer, _f0mean, _zcr, _ste +from sklearn.ensemble import RandomForestClassifier +from sklearn.neural_network import MLPClassifier + + +class Model: + """The Model for creaky voice classification.""" + def __init__(self): + self._config = creapy.utils.get_config()["MODEL"] + self._X_train: pd.DataFrame + self._y_train: pd.Series + self._imputer: SimpleImputer + self._features = self._config["FEATURES"]["for_classification"] + self._fitted = False + _clf = self._config["CLASSIFIER"]["clf"] + self._clf = clfs[_clf]( + **self._config["CLASSIFIER"]["VALUES"][_clf.upper()]["kwargs"]) + + def fit(self, X_train: pd.DataFrame, y_train: pd.DataFrame): + """Function to fit the model with training data. + + Args: + X_train (pd.DataFrame): Features of training data. + y_train (pd.Dataframe): Targets of training data (creak, no-creak). + """ + if isinstance(y_train, pd.DataFrame): + y_train = y_train.to_numpy() + if self._config["PREPROCESSING"]["impute_at_fit"] is True: + self._X_train, self._imputer = creapy.preprocessing.impute( + X_train=X_train.loc[:, self._features], return_imputer=True) + else: + self._X_train = X_train + self._y_train = pd.Series(y_train, name=self._config["target_label"]) + self._clf.fit( + self._X_train.loc[:, self._features], self._y_train) + self._fitted = True + + def predict(self, X_test: pd.DataFrame, predict_proba: bool=None) -> np.ndarray: + """Predicts the given features. + + Args: + X_test (pd.DataFrame): Features to be predicted. + predict_proba (bool, optional): If `True` the likelihood to be creak will be returned, else the predicted target. + Defaults to None. + + Returns: + np.ndarray: Predicted targets, or probability of creak. + """ + self._config = creapy.utils.get_config()["MODEL"] + if predict_proba is not None: + assert isinstance(predict_proba, bool) + else: + predict_proba = self._config["CLASSIFIER"]["predict_proba"] + if hasattr(self, "_imputer"): + X_test = pd.DataFrame(self._imputer.transform( + X_test.loc[:, self._features]), columns=self._X_train.columns, index=X_test.index) + if predict_proba is True: + _target_index = np.argwhere( + self._clf.classes_ == self._config["CLASSIFIER"]["target_name"]).item() + y_pred = self._clf.predict_proba(X_test[self._features])[ + :, _target_index].flatten() + if self._config["POSTPROCESSING"]["MAVG"]["mavg"] is True: + length, mode = map( + self._config["POSTPROCESSING"]["MAVG"]["VALUES"].get, ("length", "mode")) + y_pred = creapy.postprocessing.moving_average(y_pred, length, mode) + else: + y_pred = self._clf.predict(X_test[self._features]) + + return y_pred + + +def read_wav( + data, + sr, + normalize: bool = True, + start: float = 0.0, + end: float | int = -1, + mono=True +) -> tuple[np.ndarray, int]: + if mono is True and data.ndim > 1: + data = data.sum(axis=1) / data.shape[1] + + max_ = max(abs(data)) + if end == -1: + data = data[int(start*sr):] + else: + data = data[int(start*sr):int(end*sr)] + + if normalize is True: + data /= max_ + + return data, sr + + +def _hnr(data: np.ndarray, sound: pm.Sound, sr) -> float: + try: + harmonicity = sound.to_harmonicity() + except pm.PraatError: + hnr = np.nan + else: + # taken from + # https://parselmouth.readthedocs.io/en/stable/examples/batch_processing.html?highlight=harmonicity#Batch-processing-of-files + # check if empty + valid_values = harmonicity.values[harmonicity.values != -200] + if valid_values.size > 0: + hnr = valid_values.mean() + else: + hnr = np.nan + return hnr + + +def blockwise_feature_calculation(data: np.ndarray, sr, feature): + + sounds = [pm.Sound(values=block, sampling_frequency=sr) for block in data] + function = FEATURE_MAPPING[feature] + res = [function(block, sound, sr) for block, sound in zip(data, sounds)] + return np.array(res) + + +def process_file(data, sample_rate: int = 16_000): + _config = creapy.config.get_config() + user_cfg = _config['USER'] + model_cfg = _config['MODEL'] + + start, end = user_cfg['audio_start'], user_cfg['audio_end'] + data, sr = read_wav(data, sample_rate, start=start, end=end) + + w = hann(int(user_cfg["block_size"] * sample_rate)) + creak_data_buff = creapy.preprocessing.buffer(data, sample_rate, window=w) + data_buffer = creak_data_buff.T + + unvoiced_excl = model_cfg['PREPROCESSING']['UNVOICED_EXCLUSION'] + preprocessing_features = [key for key, val in unvoiced_excl.items() if val is True] + + elimination_chunks = np.stack([ + blockwise_feature_calculation( + data_buffer, sample_rate, feature + ) for feature in preprocessing_features + ], axis=1) + + preproc_values = unvoiced_excl['VALUES'] + preproc_values['ZCR']['threshold'] = user_cfg['zcr_threshold'] + preproc_values['STE']['threshold'] = user_cfg['ste_threshold'] + + thresholds = np.array([ + creapy.postprocessing.thresholding( + series=elimination_chunks[:, i], + **preproc_values[feature.upper()] + ) + for i, feature in enumerate(preprocessing_features) + ]) + included_indices = thresholds.sum(axis=0) == 0 + + if not np.any(included_indices): + warnings.warn("Did not make classification. Adjust ZCR/STE thresholds.") + y_pred = np.zeros(creak_data_buff.shape[1]) + X_test = pd.DataFrame(elimination_chunks, columns=preprocessing_features) + return X_test, y_pred, included_indices + + class_features = model_cfg["FEATURES"]["for_classification"] + X_class = np.stack([ + blockwise_feature_calculation( + data_buffer[included_indices], sample_rate, feature + ) for feature in class_features + ], axis=1) + + _X_test = pd.DataFrame( + X_class, + columns=class_features, + index=np.flatnonzero(included_indices) + ) + + X_all = np.zeros((elimination_chunks.shape[0], elimination_chunks.shape[1] + len(class_features))) + X_all[:, :elimination_chunks.shape[1]] = elimination_chunks + X_all[included_indices, elimination_chunks.shape[1]:] = X_class + + X_test = pd.DataFrame(X_all, columns=preprocessing_features + class_features) + + y_pred = np.zeros(creak_data_buff.shape[1]) + gender_model = user_cfg['gender_model'] + model_path = creapy.utils.helpers.get_root() / model_cfg["model_location"] + model_path = (model_path.parent / f"{model_path.stem}_{gender_model.upper()}").with_suffix(".csv") + model = load_model(model_path) + + y_pred[included_indices] = model.predict(_X_test) + + return X_test, y_pred, included_indices + + +def load_model(filepath: str = None) -> Model: + """Loads a already fitted model from a csv file. + + Args: + filepath (str, optional): Location of the model csv file. Defaults to None. + + Returns: + Model: Fitted Model for creak classification. + """ + filepath = Path(filepath) + + _config = creapy.utils.get_config() + _X_combined = pd.read_csv(filepath) + model = Model() + _target_column = _config["MODEL"]["target_label"] + _feature_columns = _config["MODEL"]["FEATURES"]["for_classification"] + _X_train, _y_train = _X_combined[_feature_columns], _X_combined[_target_column] + model.fit(_X_train, _y_train) + return model + + +FEATURE_MAPPING = { + "cpp": _cpp, + "hnr": _hnr, + "h1h2": _h1_h2, + "jitter": _jitter, + "shimmer": _shimmer, + "f0mean": _f0mean, + "zcr": _zcr, + "ste": _ste, +} + +clfs = { + "rfc": RandomForestClassifier, + "mlp": MLPClassifier +} diff --git a/pvq_manipulation/models/ffjord.py b/pvq_manipulation/models/ffjord.py index ad830730cd439bbfd7cb6ca5a7c5571e312555d1..0feebe2e4b8a242a27901e4581d2cd02e3c3631c 100644 --- a/pvq_manipulation/models/ffjord.py +++ b/pvq_manipulation/models/ffjord.py @@ -113,7 +113,7 @@ class FFJORD(Model): if device is None: device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') model_dict = pb.io.load(model_path / "config.json") - model = Model.from_config(model_dict) + model = Model.from_config(model_dict['model']) cp = torch.load( model_path / checkpoint, map_location=device, diff --git a/pvq_manipulation/models/vits.py b/pvq_manipulation/models/vits.py index 0bbcefc53b18a981ee6c8ca4b6c7e72c7bef2ba3..2a58c5c4c00b462fc7a628927d2e819b8819cb31 100644 --- a/pvq_manipulation/models/vits.py +++ b/pvq_manipulation/models/vits.py @@ -28,7 +28,8 @@ from TTS.utils.audio import AudioProcessor from TTS.vocoder.models.hifigan_generator import HifiganGenerator from trainer.trainer import to_cuda from typing import Dict, List, Union - +if not torch.cuda.is_available(): + from concurrent.futures import ThreadPoolExecutor class Vits_NT(Vits): def __init__( @@ -205,12 +206,13 @@ class Vits_NT(Vits): Returns: - model_outputs (torch.Tensor): (batch_size, T_wav) Synthesized waveform """ - speaker_embedding = aux_input['d_vector'].detach()[:, :, None] - if aux_input['d_vector_man'] is not None: - speaker_embedding_man = aux_input['d_vector_man'].detach()[:, :, None] + speaker_embedding = aux_input['d_vector'] + if 'd_vector_man' in aux_input.keys() and aux_input['d_vector_man'] is not None: + speaker_embedding_man = aux_input['d_vector_man'] else: speaker_embedding_man = speaker_embedding - aux_input['tokens'] = x.clone() + + aux_input['tokens'] = x x_lengths = self._set_x_lengths(x, aux_input) x, m_p, logs_p, x_mask = self.text_encoder( x, @@ -220,7 +222,7 @@ class Vits_NT(Vits): logw = self.duration_predictor( x, x_mask, - g=speaker_embedding, + g=speaker_embedding[:, :, None], lang_emb=None, ) @@ -231,21 +233,41 @@ class Vits_NT(Vits): attn_mask = x_mask * y_mask.transpose(1, 2) attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1).transpose(1, 2)) - m_p = torch.matmul(attn.transpose(1, 2), m_p.transpose(1, 2)).transpose(1, 2) - logs_p = torch.matmul(attn.transpose(1, 2), logs_p.transpose(1, 2)).transpose(1, 2) + + m_p = torch.einsum('blm, bnl -> bnm', attn, m_p) + logs_p = torch.einsum('blm, bnl -> bnm', attn, logs_p) z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * self.inference_noise_scale - z = self.flow(z_p, y_mask, g=speaker_embedding_man, reverse=True) + z = self.flow(z_p, y_mask, g=speaker_embedding_man[:, :, None], reverse=True) z, _, _, y_mask = self.upsampling_z( z, y_lengths=y_lengths, y_mask=y_mask ) - o = self.waveform_decoder( - (z * y_mask)[:, :, : self.max_inference_len], - g=speaker_embedding_man if self.config.gan_speaker_conditioning else None - ) + + if not torch.cuda.is_available(): + num_chunks = 2 + chunk_size = z.shape[-1] // num_chunks + z_chunks = torch.split(z, chunk_size, dim=-1) + + def decode_chunk(z_chunk): + return self.waveform_decoder( + z_chunk, + g=speaker_embedding_man[:, :, None] if self.config.gan_speaker_conditioning else None + ) + + with ThreadPoolExecutor(max_workers=num_chunks) as executor: + futures = [executor.submit(decode_chunk, chunk) for chunk in z_chunks] + results = [f.result() for f in futures] + + o = torch.cat(results, dim=-1) + + else: + o = self.waveform_decoder( + (z * y_mask)[:, :, : self.max_inference_len], + g=speaker_embedding_man[:, :, None] if self.config.gan_speaker_conditioning else None + ) return o def forward(self, x, x_lengths, y, y_lengths, aux_input, inference=False): diff --git a/requirements.txt b/requirements.txt index 98d52a14a4ffa47d55c13c18ddd157f35f59c2ba..00c55f3bd5b1fe3118dd5a65e72cdb350583ff2c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ TTS==0.22.0 padertorch onnxruntime torchdiffeq +git+https://gitlab.tugraz.at/speech/creapy.git