我的Jupyter Notebook中的相位声码器发出噪音并且结果不正确

Question

我正在尝试写一个Python类，里面有一些方法可以让我编辑音频。这里提到的方法是想用一个相位声码器来改变音调。我知道有librosa声码器和其他很多工具，但我想自己动手做。这个声码器在音调不变的时候工作得很好，但一旦我尝试改变音调，它就变得很卡，听起来很奇怪。我正在尝试使用Jentgent的方法，具体可以在这里找到：https://github.com/JentGent/pitch-shift/blob/main/audios.ipynb。

import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile
from scipy.io.wavfile import write
import scipy.signal
import IPython
from IPython.display import Audio


class Audio_object:
    def __init__(self, file, mono=True):
        self.samplerate, self.data = wavfile.read(file)
        if mono and self.data.ndim > 1:
            self.data = np.mean(self.data, axis=1).astype(self.data.dtype)

        
    def writefile(self, name):
        filename = name + ".wav"
        write(filename, self.samplerate, self.data.astype(np.int16))

    def stft(self, window='hann', seg_ratio= 0.1, overlap=0.5):
        nperseg = self.samplerate*seg_ratio
        f, t, Zxx = scipy.signal.stft(self.data, fs=self.samplerate, window=window, nperseg=nperseg, noverlap=overlap)
        return f, t, Zxx

    def inverse_stft(self, Zxx, window='hann',seg_ratio= 0.1, overlap = 0.5):
        nperseg = self.samplerate*seg_ratio
        _, x_rec = scipy.signal.istft(Zxx, fs=self.samplerate, window=window, nperseg=nperseg, noverlap=overlap)
        return x_rec

    def circle(self, name):
        filename = name + ".wav"
        _, _, Zxx = self.stft()
        x_rec = self.inverse_stft(Zxx)
        write(filename, self.samplerate, x_rec.astype(np.int16))

    def plot(self, data,title):
        plt.plot(data)
        plt.xlabel("Sample Index")
        plt.ylabel("Amplitude")
        plt.title(title)

    
    def plot_fft(self):
        f, _, Zxx = self.stft()
        avg_spectrum = np.mean(np.abs(Zxx), axis=1) # taking the mean of the magnitude across the spectrum,
        plt.figure(figsize=(10, 5))                 # this is not done for further use, only for visual representation
        plt.plot(f, avg_spectrum)
        plt.title('Average FFT Magnitude Spectrum')
        plt.xlabel('Frequency [Hz]')
        plt.ylabel('Magnitude')
        plt.grid(True)
        plt.show()

    def plot_spectrogram(self, f, t, Zxx):
        plt.figure(figsize=(10, 5))
        plt.pcolormesh(t, f, np.abs(Zxx), shading='gouraud')
        plt.title('Spectrogram')
        plt.xlabel('Time [s]')
        plt.ylabel('Frequency [Hz]')
        plt.ylim(0,3500) 
        plt.show()


    @staticmethod
    def interpolate_time(idxs, arr):
        start = (idxs + 0.5).astype(int)
        frac = (idxs - start)[None, None, :]
        shifted_arr = np.concatenate((arr[:, 1:], np.zeros((arr.shape[0], arr.shape[1]))), axis=1)
        return arr[:, start] * (1 - frac) + shifted_arr[:, :, start] * frac
    
    @staticmethod
    def interpolate_time(idxs, arr):
        start = np.minimum((idxs + 0.5).astype(int), arr.shape[1]-1)
        frac = (idxs - start)[None, None, :]
        shifted_arr = np.concatenate((arr[:, 1:], np.zeros((arr.shape[0], arr.shape[1]))), axis=1)
        return arr[:, start] * (1 - frac) + shifted_arr[ :, start] * frac
    
    def pitch_shift(self, semitones, seg_ratio=0.1, overlap=0.5):         
        scaling = 2 ** (semitones / 12)
        f,t,Zxx = self.stft()
        

        anls_frames = np.arange(len(Zxx))
        n_synth_frames = np.floor(len(Zxx) * scaling).astype(int)
        synth_frames = np.arange(n_synth_frames)
        og_idxs = np.minimum(synth_frames / scaling,len(Zxx) - 1)

        mags = np.abs(Zxx)
        phases = np.angle(Zxx)
        #print((np.zeros((len(Zxx), 1)), phases[:, :-1]))
        phase_diffs = phases - np.concatenate((np.zeros(( len(Zxx), 1)), phases[:, :-1]), axis=1)
        phase_diffs = np.mod(phase_diffs, np.pi * 2)


        shifted_mags = self.interpolate_time(og_idxs, mags)
        shifted_phase_diffs = self.interpolate_time(og_idxs, phase_diffs)

        shifted_phases = np.cumsum(shifted_phase_diffs, axis=2)

        synth_stft = shifted_mags * np.exp(shifted_phases * 1j)


        new_waveform = self.inverse_stft(synth_stft.astype(np.complex64))
        return Audio(new_waveform.astype(np.int16), rate=int(self.samplerate*scaling))

gettysburg = Audio_object("gettysburg.wav", mono=True)

audio_widget = gettysburg.pitch_shift(12)
audio_widget

我一直在尝试调整一些变量，但STFT数组对我来说有点复杂。

信号处理音频处理相位声码器音调变化 stft librosa 频域分析噪声抑制

我的Jupyter Notebook中的相位声码器发出噪音并且结果不正确

0 个回答

撰写回答