我的Jupyter Notebook中的相位声码器发出噪音并且结果不正确
我正在尝试写一个Python类,里面有一些方法可以让我编辑音频。这里提到的方法是想用一个相位声码器来改变音调。我知道有librosa声码器和其他很多工具,但我想自己动手做。这个声码器在音调不变的时候工作得很好,但一旦我尝试改变音调,它就变得很卡,听起来很奇怪。我正在尝试使用Jentgent的方法,具体可以在这里找到:https://github.com/JentGent/pitch-shift/blob/main/audios.ipynb。
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile
from scipy.io.wavfile import write
import scipy.signal
import IPython
from IPython.display import Audio
class Audio_object:
def __init__(self, file, mono=True):
self.samplerate, self.data = wavfile.read(file)
if mono and self.data.ndim > 1:
self.data = np.mean(self.data, axis=1).astype(self.data.dtype)
def writefile(self, name):
filename = name + ".wav"
write(filename, self.samplerate, self.data.astype(np.int16))
def stft(self, window='hann', seg_ratio= 0.1, overlap=0.5):
nperseg = self.samplerate*seg_ratio
f, t, Zxx = scipy.signal.stft(self.data, fs=self.samplerate, window=window, nperseg=nperseg, noverlap=overlap)
return f, t, Zxx
def inverse_stft(self, Zxx, window='hann',seg_ratio= 0.1, overlap = 0.5):
nperseg = self.samplerate*seg_ratio
_, x_rec = scipy.signal.istft(Zxx, fs=self.samplerate, window=window, nperseg=nperseg, noverlap=overlap)
return x_rec
def circle(self, name):
filename = name + ".wav"
_, _, Zxx = self.stft()
x_rec = self.inverse_stft(Zxx)
write(filename, self.samplerate, x_rec.astype(np.int16))
def plot(self, data,title):
plt.plot(data)
plt.xlabel("Sample Index")
plt.ylabel("Amplitude")
plt.title(title)
def plot_fft(self):
f, _, Zxx = self.stft()
avg_spectrum = np.mean(np.abs(Zxx), axis=1) # taking the mean of the magnitude across the spectrum,
plt.figure(figsize=(10, 5)) # this is not done for further use, only for visual representation
plt.plot(f, avg_spectrum)
plt.title('Average FFT Magnitude Spectrum')
plt.xlabel('Frequency [Hz]')
plt.ylabel('Magnitude')
plt.grid(True)
plt.show()
def plot_spectrogram(self, f, t, Zxx):
plt.figure(figsize=(10, 5))
plt.pcolormesh(t, f, np.abs(Zxx), shading='gouraud')
plt.title('Spectrogram')
plt.xlabel('Time [s]')
plt.ylabel('Frequency [Hz]')
plt.ylim(0,3500)
plt.show()
@staticmethod
def interpolate_time(idxs, arr):
start = (idxs + 0.5).astype(int)
frac = (idxs - start)[None, None, :]
shifted_arr = np.concatenate((arr[:, 1:], np.zeros((arr.shape[0], arr.shape[1]))), axis=1)
return arr[:, start] * (1 - frac) + shifted_arr[:, :, start] * frac
@staticmethod
def interpolate_time(idxs, arr):
start = np.minimum((idxs + 0.5).astype(int), arr.shape[1]-1)
frac = (idxs - start)[None, None, :]
shifted_arr = np.concatenate((arr[:, 1:], np.zeros((arr.shape[0], arr.shape[1]))), axis=1)
return arr[:, start] * (1 - frac) + shifted_arr[ :, start] * frac
def pitch_shift(self, semitones, seg_ratio=0.1, overlap=0.5):
scaling = 2 ** (semitones / 12)
f,t,Zxx = self.stft()
anls_frames = np.arange(len(Zxx))
n_synth_frames = np.floor(len(Zxx) * scaling).astype(int)
synth_frames = np.arange(n_synth_frames)
og_idxs = np.minimum(synth_frames / scaling,len(Zxx) - 1)
mags = np.abs(Zxx)
phases = np.angle(Zxx)
#print((np.zeros((len(Zxx), 1)), phases[:, :-1]))
phase_diffs = phases - np.concatenate((np.zeros(( len(Zxx), 1)), phases[:, :-1]), axis=1)
phase_diffs = np.mod(phase_diffs, np.pi * 2)
shifted_mags = self.interpolate_time(og_idxs, mags)
shifted_phase_diffs = self.interpolate_time(og_idxs, phase_diffs)
shifted_phases = np.cumsum(shifted_phase_diffs, axis=2)
synth_stft = shifted_mags * np.exp(shifted_phases * 1j)
new_waveform = self.inverse_stft(synth_stft.astype(np.complex64))
return Audio(new_waveform.astype(np.int16), rate=int(self.samplerate*scaling))
gettysburg = Audio_object("gettysburg.wav", mono=True)
audio_widget = gettysburg.pitch_shift(12)
audio_widget
我一直在尝试调整一些变量,但STFT数组对我来说有点复杂。
0 个回答
暂无回答