https://github.com/myshell-ai/OpenVoice
OpenVoice์ ๊ด์ฌ์ ๊ฐ์ง๊ฒ ๋ ์ด์ ๋, ํ๋๋ค.
"๋ชฉ์๋ฆฌ ๋ณต์ ๋ฅผ ํ๊ณ ์ถ๋ค. ๋ด๊ฐ ์ข์ํ๋ ๋ชฉ์๋ฆฌ๋ก ํ๊ธ ํ ์คํธ๋ฅผ ๋ฌด์ ํ์ผ๋ก ์ฝ์ด์คฌ์ผ๋ฉด ์ข๊ฒ ๋ค"
๊ธฐ์กด tts๋ ๊ธธ์ด์ ํ์ด ์๊ณ ์ฌ์ฉํ๊ธฐ๋ ๋ณต์กํ๊ธฐ์ ์ด๊ฒ์ ๋๋ค๋ฆฌ์ฒ๋ผ ์์ฃผ ๊ฐ๋จํ๊ฒ
1. ๋์ฉ๋ ํ ์คํธ ํ์ผ ์ฒจ๋ถ
2. ๋ชฉ์๋ฆฌ ์์ฑ ๋ฒํผ ํด๋ฆญ
3. wavํ์ผ ์์ฑ
ํด์ฃผ๋ ์ฑ์ ๋ง๋ค๊ณ ์ถ์๋ค. ํ์ฌ๊น์ง r&d ๊ฒฐ๊ณผ๋ก๋ ๋ถ์ ์ ์ด์ง๋ง, ์ด ๋ถ์ ์ ๊ฒฐ๋ก ๊น์ง ๋๋ฌํ ๊ณผ์ ์ ๊ธฐ๋ก์ผ๋ก ๋จ๊ธฐ๋ คํ๋ค.
์ผ๋จ, V1์ ๋ค๊ตญ์ด ์ง์์ด ์๋๋ค.
์์ด์ ์ค๊ตญ์ด๋ง ๋๋ค. ๋ง์ฝ ํ๊ตญ์ด ๋ฅผ ์ฝ๊ฒ ํ๋ฉด ์๋์ ๊ฐ์ด ์ฝ์ด์ค๋ค. ๋ค์ดํฐ๋ธ ๋ฏธ๊ตญ์ธ์ด ํ๊ตญ์ด ๋งํ๋๊ฑฐ ๊ฐ๋ค.
* ๊นํ์์ ์ค์นํ๊ธฐ
git clone https://github.com/myshell-ai/OpenVoice.git open_voice
cd open_voice
* ํ๊ฒฝ๋ง๋ค๊ธฐ - ํ์ด์ฌ ๋ฒ์ ์ด ๊ผฌ์ฌ์ conda๋ฅผ ์ด์ฉํ๋ค.
conda create -n ov python=3.9
conda activate ov
pip install -r requirements.txt
* condaํ๊ฒฝ์์ ffmpeg๊ฐ ์๋ค๊ณ ๋ฌ๋ค๋ฉด ์๋์ฒ๋ผ ๊ผญ ffmpeg๋ฅผ ์ค์นํด์ค์ผ ํ๋ค.
conda install ffmpeg
* cpu๋ก ๋๋ฆฌ๊ธฐ - cuda๋ฅผ ์ฌ์ฉํ๋ค๋ฉด pass!
- ํ์ฌ open voice์์๋ cpu๋ฅผ ์ฌ์ฉํ ์๊ฐ ์๋ค. ์ฝ๋๊ฐ ๋๋ฝ๋์ด์๊ณ ์ด ๋ถ๋ถ์ ์์ ํด์ฃผ๊ณ ์์ง ์์์ ์ง์ ์์ ํด์ค์ผ ํ๋ค.
๋จผ์ se_extractor.py ํ์ผ๋ก ๊ฐํ 22๋ฒ์งธ ์ค์ ์๋ ์ฝ๋๋ฅผ
device = "cuda" if torch.cuda.is_available() else "cpu"
model = WhisperModel(model_size, device=device, compute_type="float16")
์๋์ ๊ฐ์ด ๋ฐ๊ฟ์ค๋ค.
device, compute_type = ("cuda","float16") if torch.cuda.is_available() else ("cpu", "int8")
model = WhisperModel(model_size, device=device, compute_type=compute_type)
* ๋ค ๋๋ค. ์ด์ ๋๋ ค๋ณด์. ํ๊ตญ์ด๋ฅผ ์ฝ์ด๋ณด๊ฒ ํ๋ค. ์์ด๋ฅผ ์ฌ์ฉํ๊ณ ์ถ๋ค๋ฉด ์๋ ์ฃผ์์ ํ๋ฉด๋๋ค.
# OpenVoice V1 demo: extract the tone color (speaker embedding) of a
# reference recording, then re-voice base-speaker TTS output with it.
#
# Pipeline per sample: the base-speaker TTS renders `text` into a temporary
# wav (src_path), then the tone-color converter rewrites that wav with the
# timbre extracted from the reference speaker and saves it to save_path.
import os
import torch
from openvoice import se_extractor
from openvoice.api import BaseSpeakerTTS, ToneColorConverter

# Checkpoint locations: English base speaker + the shared tone-color converter.
ckpt_base = 'checkpoints/base_speakers/EN'
ckpt_converter = 'checkpoints/converter'
device = "cuda:0" if torch.cuda.is_available() else "cpu"
output_dir = 'outputs'

base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)
base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')

tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

os.makedirs(output_dir, exist_ok=True)

# Speaker embedding of the default English base voice (conversion source).
source_se = torch.load(f'{ckpt_base}/en_default_se.pth').to(device)

# reference_speaker = 'resources/example_reference.mp3'  # This is the voice you want to clone
reference_speaker = 'resources/lympe.mp3'  # This is the voice you want to clone
target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir='processed', vad=True)

# --- Sample 1: default English base speaker --------------------------------
# inference
save_path = f'{output_dir}/output_en_default.wav'

# Run the base speaker tts.
# NOTE: V1 supports only English and Chinese, so this Korean text is fed
# through the English front end on purpose (it comes out sounding like a
# native English speaker attempting Korean).
# text = "This audio is generated by OpenVoice."
text = "안녕하세요! 오늘은 날씨가 정말 좋네요."  # fix: string literal must be on one line
src_path = f'{output_dir}/tmp.wav'
base_speaker_tts.tts(text, src_path, speaker='default', language='English', speed=1.0)

# Run the tone color converter.
encode_message = "@MyShell"  # watermark message embedded into the output audio
tone_color_converter.convert(
    audio_src_path=src_path,
    src_se=source_se,
    tgt_se=target_se,
    output_path=save_path,
    message=encode_message)

# --- Sample 2: whispering English style ------------------------------------
source_se = torch.load(f'{ckpt_base}/en_style_se.pth').to(device)
save_path = f'{output_dir}/output_whispering.wav'

# Run the base speaker tts
# text = "This audio is generated by OpenVoice."
text = "안녕하세요! 오늘은 날씨가 정말 좋네요."
src_path = f'{output_dir}/tmp.wav'
base_speaker_tts.tts(text, src_path, speaker='whispering', language='English', speed=0.9)

# Run the tone color converter
encode_message = "@MyShell"
tone_color_converter.convert(
    audio_src_path=src_path,
    src_se=source_se,
    tgt_se=target_se,
    output_path=save_path,
    message=encode_message)

# --- Sample 3: default Chinese base speaker --------------------------------
ckpt_base = 'checkpoints/base_speakers/ZH'
base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)
base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')
source_se = torch.load(f'{ckpt_base}/zh_default_se.pth').to(device)
save_path = f'{output_dir}/output_chinese.wav'

# Run the base speaker tts
# text = "今天天气真好，我们一起出去吃饭吧。"
text = "안녕하세요! 오늘은 날씨가 정말 좋네요."
src_path = f'{output_dir}/tmp.wav'
base_speaker_tts.tts(text, src_path, speaker='default', language='Chinese', speed=1.0)

# Run the tone color converter
encode_message = "@MyShell"
tone_color_converter.convert(
    audio_src_path=src_path,
    src_se=source_se,
    tgt_se=target_se,
    output_path=save_path,
    message=encode_message)
* ๊ฒฐ๋ก
V1์ ์๋์ฐ๋ ๋งฅ, ๋ชจ๋์์ ์ ๋์๊ฐ๋ค. ์์ด ์ฑ๋ฅ์ v1๋ ์ถฉ๋ถํ ์ข์๋ค.
๋ค์ ํฌ์คํธ์์ ์ ๋ฆฌํ V2๋ ํ๊ตญ์ด์ ๊ฒฝ์ฐ cuda ํ๊ฒฝ์์๋ง ๊ฐ๋ฅํ๊ณ - ์์ด๋ ์ค๊ตญ์ด๋ ์ฌ์ ํ cpu์์ ๋์๊ฐ๋ค - ์ฌ๊ธฐ์ ๋ชฉ์๋ฆฌ ํธ๋ ์ด๋๋ ๊ฐ๋ฅํ๋ค. ๋ฌผ๋ก ์ฑ๋ฅ์ ๊ทธ๋ค์ง ๋ง์กฑ์ค๋ฝ์ง ์์ง๋ง ์ฌ๋ฌ ํ
์คํธ๋ฅผ ํด๋ณด๋ ์ด๋ค ๋ชฉ์๋ฆฌ๋ ๊ฝค๋ ์ ๋ณต์ ํด๋๋ค.
์์ธํ ์ฌํญ์ V2์ ๋จ๊ธฐ๊ฒ ๋ค.
'AI ์์ฑ' ์นดํ ๊ณ ๋ฆฌ์ ๋ค๋ฅธ ๊ธ
xtts-webui๋ก coqui ์ค์นํ๊ธฐ (0) | 2024.07.01 |
---|---|
coqui tts(xtts) v2 ์ฌ์ฉ๊ธฐ ์ ๋ฆฌ (0) | 2024.05.17 |
xtts๊ฐ ํจ์ฌ ์ข๋ค (1) | 2024.05.11 |
[Whisper-WebUI] ์๋ ์๋ง ์์ฑ ๋ฐ ์ถ์ถ & ๋ฒ์ญ๊น์ง ํ๋ฒ์ (2) | 2023.06.27 |