I've written a small patch for eSpeak NG which adds a singing synthesis mode similar to the singing-mode.scm included in Festival and used by the Singing Computer[1]. The patch adds two command-line options for setting the note pitch: -e takes the pitch as a frequency in hertz, and -u takes it as a MIDI note number. When singing mode is selected (that is, when a nonzero pitch has been set with either option), eSpeak NG prints to standard output the information needed for transforming speech into a singing voice.
[1] https://freebsoft.org/singing-computer

--
Tobias Platen <tpla...@posteo.de>
diff --git a/src/espeak-ng.c b/src/espeak-ng.c index 1634cc1d..d54e0ce8 100644 --- a/src/espeak-ng.c +++ b/src/espeak-ng.c @@ -324,6 +324,7 @@ int main(int argc, char **argv) { "compile-intonations", no_argument, 0, 0x10f }, { "compile-phonemes", optional_argument, 0, 0x110 }, { "load", no_argument, 0, 0x111 }, + { "utau-note", required_argument, 0, 0x112 }, { 0, 0, 0, 0 } }; @@ -368,7 +369,7 @@ int main(int argc, char **argv) option_punctlist[0] = 0; while (true) { - c = getopt_long(argc, argv, "a:b:d:f:g:hk:l:mp:qs:v:w:xXz", + c = getopt_long(argc, argv, "a:b:d:e:f:g:hk:l:mp:qs:v:u:w:xXz", long_options, &option_index); // Detect the end of the options. @@ -426,6 +427,12 @@ int main(int argc, char **argv) case 'g': wordgap = atoi(optarg2); break; + case 'e': + espeak_sg_SetUtauNoteFreq(atoi(optarg2)); + break; + case 'u': + espeak_sg_SetUtauNote(atoi(optarg2)); + break; case 'v': strncpy0(voicename, optarg2, sizeof(voicename)); break; diff --git a/src/include/espeak-ng/speak_lib.h b/src/include/espeak-ng/speak_lib.h index cc1e35e7..fba10c4f 100644 --- a/src/include/espeak-ng/speak_lib.h +++ b/src/include/espeak-ng/speak_lib.h @@ -468,6 +468,9 @@ ESPEAK_API espeak_ERROR espeak_SetParameter(espeak_PARAMETER parameter, int valu EE_INTERNAL_ERROR. 
*/ +ESPEAK_API void espeak_sg_SetUtauNote(int value); +ESPEAK_API void espeak_sg_SetUtauNoteFreq(int value); + #ifdef __cplusplus extern "C" #endif diff --git a/src/libespeak-ng/speech.c b/src/libespeak-ng/speech.c index b415e537..ccf7bedb 100644 --- a/src/libespeak-ng/speech.c +++ b/src/libespeak-ng/speech.c @@ -796,6 +796,18 @@ ESPEAK_API int espeak_GetParameter(espeak_PARAMETER parameter, int current) return param_defaults[parameter]; } +extern int utau_pitch; + +ESPEAK_API void espeak_sg_SetUtauNote(int value) +{ + utau_pitch = pow(2.0, (value - 69) / 12.0) * 440; +} + +ESPEAK_API void espeak_sg_SetUtauNoteFreq(int value) +{ + utau_pitch = value; +} + ESPEAK_NG_API espeak_ng_STATUS espeak_ng_SetParameter(espeak_PARAMETER parameter, int value, int relative) { #ifdef USE_ASYNC diff --git a/src/libespeak-ng/synthesize.c b/src/libespeak-ng/synthesize.c index b274ae42..eb4583e2 100644 --- a/src/libespeak-ng/synthesize.c +++ b/src/libespeak-ng/synthesize.c @@ -223,6 +223,15 @@ static void DoPause(int length, int control) } } + +static void DoOto(char* oto, int type) +{ + wcmdq[wcmdq_tail][0] = WCMD_OTO; + wcmdq[wcmdq_tail][1] = oto; + wcmdq[wcmdq_tail][2] = type; + WcmdqInc(); +} + extern int seq_len_adjust; // temporary fix to advance the start point for playing the wav sample static int DoSample2(int index, int which, int std_length, int control, int length_mod, int amp) @@ -1131,6 +1140,8 @@ void DoEmbedded(int *embix, int sourceix) } while ((word & 0x80) == 0); } +extern int utau_pitch; + int Generate(PHONEME_LIST *phoneme_list, int *n_ph, bool resume) { static int ix; @@ -1187,6 +1198,14 @@ int Generate(PHONEME_LIST *phoneme_list, int *n_ph, bool resume) while ((ix < (*n_ph)) && (ix < N_PHONEME_LIST-2)) { p = &phoneme_list[ix]; + + if(utau_pitch) + { + char ecantorix_debug_buf[30]; + int ecantorix_debug_flags=0; + WritePhMnemonic(ecantorix_debug_buf, p->ph, p, 0, &ecantorix_debug_flags); + DoOto(strdup(ecantorix_debug_buf),p->type); + } if (p->type == phPAUSE) 
free_min = 10; @@ -1236,6 +1255,8 @@ int Generate(PHONEME_LIST *phoneme_list, int *n_ph, bool resume) } } + //emit oto here + switch (p->type) { case phPAUSE: diff --git a/src/libespeak-ng/synthesize.h b/src/libespeak-ng/synthesize.h index d7760f74..f9d32bd6 100644 --- a/src/libespeak-ng/synthesize.h +++ b/src/libespeak-ng/synthesize.h @@ -433,6 +433,7 @@ extern unsigned char pitch_adjust_tab[MAX_PITCH_VALUE+1]; #define WCMD_MBROLA_DATA 13 #define WCMD_FMT_AMPLITUDE 14 #define WCMD_SONIC_SPEED 15 +#define WCMD_OTO 16 #define N_WCMDQ 170 #define MIN_WCMDQ 25 // need this many free entries before adding new phoneme diff --git a/src/libespeak-ng/wavegen.c b/src/libespeak-ng/wavegen.c index 13efb743..d3de98bd 100644 --- a/src/libespeak-ng/wavegen.c +++ b/src/libespeak-ng/wavegen.c @@ -53,6 +53,8 @@ voice_t *wvoice = NULL; +int utau_pitch=0; + FILE *f_log = NULL; static int option_harmonic1 = 10; static int flutter_amp = 64; @@ -324,6 +326,18 @@ static unsigned char pk_shape2[PEAKSHAPEW+1] = { static unsigned char *pk_shape; +static void printUtauHeader() +{ + if(utau_pitch==0) return; + static int done=0; + if(done==0) { + printf("SG_SAMPLERATE %i\n",samplerate); + float period = samplerate*1.0/utau_pitch; + printf("SG_PERIOD %f\n",period); + } + done=1; +} + void WavegenInit(int rate, int wavemult_fact) { int ix; @@ -530,6 +544,7 @@ int PeaksToHarmspect(wavegen_peaks_t *peaks, int pitch, int *htab, int control) return hmax; // highest harmonic number } + static void AdvanceParameters() { // Called every 64 samples to increment the formant freq, height, and widths @@ -554,6 +569,12 @@ static void AdvanceParameters() x = ((int)(Flutter_tab[Flutter_ix >> 6])-0x80) * flutter_amp; Flutter_ix += Flutter_inc; wdata.pitch += x; + + if(utau_pitch) + { + wdata.pitch = (utau_pitch<<12) + x; + } + if (wdata.pitch < 102400) wdata.pitch = 102400; // min pitch, 25 Hz (25 << 12) @@ -671,6 +692,8 @@ static int ApplyBreath(void) return value; } + + static int Wavegen() { if (wvoice == 
NULL) @@ -788,7 +811,7 @@ static int Wavegen() modn_period = modn_period >> 4; } - if (modn_period != 0) { + if (modn_period != 0 && utau_pitch==0) { if (modn_period == 0xf) { // just once */ amplitude2 = (amplitude2 * modn_amp)/16; @@ -826,6 +849,7 @@ static int Wavegen() total += AddSineWaves(waveph, h_switch_sign, maxh, harmspect); // call an assembler code routine #else theta = waveph; + for (h = 1; h <= h_switch_sign; h++) { total += ((int)sin_tab[theta >> 5] * harmspect[h]); @@ -863,12 +887,20 @@ static int Wavegen() wdata.mix_wavefile_offset -= (wdata.mix_wavefile_max*3)/4; } - z1 = z2 + (((total>>8) * amplitude2) >> 13); + int z3 = (((total>>8) * amplitude2) >> 13); + + z1 = z2 + z3; //mixed = unvoiced + voiced + + echo = (echo_buf[echo_tail++] * echo_amp); z1 += echo >> 8; if (echo_tail >= N_ECHO_BUF) echo_tail = 0; + + printUtauHeader(); + if(utau_pitch) printf("SG_V %i %i\n",z3,z2); //we have more bits in text files + z = (z1 * agc) >> 8; @@ -884,6 +916,7 @@ static int Wavegen() } *out_ptr++ = z; *out_ptr++ = z >> 8; + echo_buf[echo_head++] = z; if (echo_head >= N_ECHO_BUF) @@ -915,6 +948,8 @@ static int PlaySilence(int length, bool resume) if (echo_tail >= N_ECHO_BUF) echo_tail = 0; + printUtauHeader(); + if(utau_pitch) printf("SG_S %i\n",value); *out_ptr++ = value; *out_ptr++ = value >> 8; @@ -967,6 +1002,8 @@ static int PlayWave(int length, bool resume, unsigned char *data, int scale, int if (echo_tail >= N_ECHO_BUF) echo_tail = 0; + printUtauHeader(); + if(utau_pitch) printf("SG_U %i\n",value); out_ptr[0] = value; out_ptr[1] = value >> 8; out_ptr += 2; @@ -1281,6 +1318,15 @@ static int WavegenFill2() switch (q[0] & 0xff) { + case WCMD_OTO: + { + char* data = (char*)q[1]; + printUtauHeader(); + char* ototypes[]= {"PAUSE","STRESS","VOWEL","LIQUID","STOP","VSTOP","FRICATIVE","VFRICATIVE","NASAL","VIRTUAL","DELETED","INVALID"}; + printf("SG_OTO %s %s\n",data,ototypes[q[2]]); + free(data); + } + break; case WCMD_PITCH: SetPitch(length, (unsigned char 
*)q[2], q[3] >> 16, q[3] & 0xffff); break;