Semi-Supervised Speaker Adaptation for End-to-End Speech Synthesis with Pretrained Models

Bibliographic Details
Title: Semi-Supervised Speaker Adaptation for End-to-End Speech Synthesis with Pretrained Models
Authors: Sunao Hara, Katsuki Inoue, Shinji Watanabe, Ryuichi Yamamoto, Masanobu Abe, Tomoki Hayashi
Source: ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)
Publisher Information: IEEE, 2020.
Publication Year: 2020
Subject Terms: Similarity (geometry), Computer science, Speech recognition, Speech synthesis, 010501 environmental sciences, computer.software_genre, 01 natural sciences, Pipeline (software), 030507 speech-language pathology & audiology, 03 medical and health sciences, End-to-end principle, Transcription (linguistics), 0305 other medical science, computer, 0105 earth and related environmental sciences, Speaker adaptation
Description: Recently, end-to-end text-to-speech (TTS) models have achieved remarkable performance; however, they require a large amount of paired text and speech data for training. On the other hand, dozens of minutes of unpaired speech recordings can easily be collected for a target speaker without corresponding text data. To make use of such accessible data, the proposed method leverages the recent success of state-of-the-art end-to-end automatic speech recognition (ASR) systems and obtains the corresponding transcriptions from pretrained ASR models. Although these models provide only raw text output rather than intermediate linguistic features such as phonemes, end-to-end TTS can be trained well on such raw text directly. Thus, the proposed method greatly simplifies the speaker adaptation pipeline by consistently employing end-to-end ASR/TTS ecosystems. The experimental results show that the proposed method achieved performance comparable to a paired-data adaptation method in terms of subjective speaker similarity and objective cepstral distance measures. (A minimal sketch of this adaptation pipeline appears after the record fields below.)
DOI: 10.1109/icassp40776.2020.9053371
Access URL: https://explore.openaire.eu/search/publication?articleId=doi_________::858c159b6af41c659715109383e20628
https://doi.org/10.1109/icassp40776.2020.9053371
Rights: CLOSED
Accession Number: edsair.doi...........858c159b6af41c659715109383e20628
Database: OpenAIRE
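
The adaptation recipe summarized in the description above — transcribe the target speaker's unpaired recordings with a pretrained end-to-end ASR model, then fine-tune a pretrained end-to-end TTS model on the resulting (speech, raw text) pseudo-pairs as if they were ordinary paired data — can be sketched as follows. This is a minimal illustrative sketch, not the authors' implementation: the transcribe and fine_tune callables are hypothetical stand-ins for whatever pretrained ASR and TTS models are actually used.

from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Iterable, List

@dataclass
class PseudoPair:
    """A (speech, text) pair whose text comes from ASR output rather than a human transcript."""
    wav_path: Path
    text: str

# Hypothetical interfaces: any pretrained end-to-end ASR / TTS models could sit behind them.
AsrTranscriber = Callable[[Path], str]             # wav file -> recognized raw text
TtsFineTuner = Callable[[List[PseudoPair]], None]  # adapts a pretrained TTS model on the pairs

def build_pseudo_pairs(wav_paths: Iterable[Path],
                       transcribe: AsrTranscriber) -> List[PseudoPair]:
    """Label the target speaker's unpaired recordings with a pretrained ASR model."""
    return [PseudoPair(wav_path=p, text=transcribe(p)) for p in wav_paths]

def adapt_speaker(wav_paths: Iterable[Path],
                  transcribe: AsrTranscriber,
                  fine_tune: TtsFineTuner) -> List[PseudoPair]:
    """Semi-supervised speaker adaptation in two steps:
    1) ASR turns unpaired target-speaker audio into (audio, raw text) pseudo-pairs;
    2) the pretrained end-to-end TTS model is fine-tuned on those pairs,
       exactly as it would be on human-transcribed paired data.
    """
    pairs = build_pseudo_pairs(wav_paths, transcribe)
    fine_tune(pairs)
    return pairs

Because the TTS model consumes raw text rather than phoneme-level linguistic features, the ASR hypotheses can be fed in directly, which is what lets the whole pipeline stay within end-to-end ASR/TTS tooling.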
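
The "objective cepstral distance" evaluation mentioned in the description is commonly instantiated as mel-cepstral distortion (MCD); whether the paper uses exactly this formulation is an assumption here. A minimal sketch, assuming the reference and synthesized mel-cepstrum sequences are already time-aligned (in practice DTW alignment is typical):

import numpy as np

def mel_cepstral_distortion(ref_mcep: np.ndarray, syn_mcep: np.ndarray) -> float:
    """Frame-averaged mel-cepstral distortion in dB between two aligned
    mel-cepstrum sequences of shape (frames, order + 1); the 0th (energy)
    coefficient is excluded by convention."""
    diff = ref_mcep[:, 1:] - syn_mcep[:, 1:]
    per_frame = (10.0 / np.log(10.0)) * np.sqrt(2.0 * np.sum(diff ** 2, axis=1))
    return float(np.mean(per_frame))

A lower MCD indicates that the adapted model's spectra are closer to the target speaker's natural speech.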