A differentiable version of SPTK
diffsptk is a differentiable version of SPTK based on the PyTorch framework.
The latest stable release can be installed through PyPI by running
pip install diffsptk
The development release can be installed from the master branch:
pip install git+https://github.com/sp-nitech/diffsptk.git@master
import diffsptk
fl = 400 # Frame length.
fp = 80 # Frame period.
n_fft = 512 # FFT length.
M = 24 # Mel-cepstrum dimensions.
# Read waveform.
x, sr = diffsptk.read("assets/data.wav")
# Compute STFT amplitude of x.
stft = diffsptk.STFT(frame_length=fl, frame_period=fp, fft_length=n_fft)
X = stft(x)
# Estimate mel-cepstrum of x.
alpha = diffsptk.get_alpha(sr)
mcep = diffsptk.MelCepstralAnalysis(cep_order=M, fft_length=n_fft, alpha=alpha, n_iter=10)
mc = mcep(X)
# Reconstruct x.
mlsa = diffsptk.MLSA(filter_order=M, frame_period=fp, alpha=alpha, taylor_order=30)
x_hat = mlsa(mlsa(x, -mc), mc)
# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)
# Compute error.
error = (x_hat - x).abs().sum()
print(error)
# Extract pitch of x.
pitch = diffsptk.Pitch(frame_period=fp, sample_rate=sr, f_min=80, f_max=180)
p = pitch(x)
# Generate excitation signal.
excite = diffsptk.ExcitationGeneration(frame_period=fp)
e = excite(p)
n = diffsptk.nrand(x.size(0) - 1)
# Synthesize waveform.
x_voiced = mlsa(e, mc)
x_unvoiced = mlsa(n, mc)
# Output analysis-synthesis result.
diffsptk.write("voiced.wav", x_voiced, sr)
diffsptk.write("unvoiced.wav", x_unvoiced, sr)
import diffsptk
fl = 400 # Frame length
fp = 80 # Frame period
n_fft = 512 # FFT length
n_channel = 80 # Number of channels
M = 12 # MFCC/PLP dimensions
# Read waveform.
x, sr = diffsptk.read("assets/data.wav")
# Compute STFT amplitude of x.
stft = diffsptk.STFT(frame_length=fl, frame_period=fp, fft_length=n_fft)
X = stft(x)
# Extract mel-spectrogram.
fbank = diffsptk.MelFilterBankAnalysis(
n_channel=n_channel,
fft_length=n_fft,
sample_rate=sr,
)
Y = fbank(X)
print(Y.shape)
# Extract MFCC.
mfcc = diffsptk.MFCC(
mfcc_order=M,
n_channel=n_channel,
fft_length=n_fft,
sample_rate=sr,
)
Y = mfcc(X)
print(Y.shape)
# Extract PLP.
plp = diffsptk.PLP(
plp_order=M,
n_channel=n_channel,
fft_length=n_fft,
sample_rate=sr,
)
Y = plp(X)
print(Y.shape)
import diffsptk
K = 4 # Number of subbands.
M = 40 # Order of filter.
# Read waveform.
x, sr = diffsptk.read("assets/data.wav")
# Decompose x.
pqmf = diffsptk.PQMF(K, M)
decimate = diffsptk.Decimation(K)
y = decimate(pqmf(x), dim=-1)
# Reconstruct x.
interpolate = diffsptk.Interpolation(K)
ipqmf = diffsptk.IPQMF(K, M)
x_hat = ipqmf(interpolate(K * y, dim=-1)).reshape(-1)
# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)
# Compute error.
error = (x_hat - x).abs().sum()
print(error)
import diffsptk
import librosa # This is to get sample audio.
fp = 128 # Frame period.
K = 252 # Number of CQ-bins.
B = 36 # Number of bins per octave.
# Read waveform.
x, sr = diffsptk.read(librosa.ex("trumpet"))
# Transform x.
cqt = diffsptk.CQT(fp, sr, n_bin=K, n_bin_per_octave=B)
c = cqt(x)
# Reconstruct x.
icqt = diffsptk.ICQT(fp, sr, n_bin=K, n_bin_per_octave=B)
x_hat = icqt(c, out_length=x.size(0))
# Write reconstructed waveform.
diffsptk.write("reconst.wav", x_hat, sr)
# Compute error.
error = (x_hat - x).abs().sum()
print(error)
import diffsptk
K = 2 # Codebook size.
M = 4 # Order of vector.
# Prepare input.
x = diffsptk.nrand(M)
# Quantize x.
vq = diffsptk.VectorQuantization(M, K)
x_hat, indices, commitment_loss = vq(x)
# Compute error.
error = (x_hat - x).abs().sum()
print(error)
This software is released under the Apache License 2.0.
@InProceedings{sp-nitech2023sptk,
author = {Takenori Yoshimura and Takato Fujimoto and Keiichiro Oura and Keiichi Tokuda},
title = {{SPTK4}: An open-source software toolkit for speech signal processing},
booktitle = {12th ISCA Speech Synthesis Workshop (SSW 2023)},
pages = {211--217},
year = {2023},
}