我正在运行以下教程中的代码:
https://keras.io/examples/audio/speaker_recognition_using_cnn/
我使用的是自定义数据集,并按照教程中的方式将其划分为两个数据集。然而,我遇到了以下错误:
TypeError: Input 'filename' of 'ReadFile' Op has type float32 that does not match expected type of string.
Colab 链接:https://colab.research.google.com/drive/1bBplixcAu6iCMfQ4njNeByozjet3GkNf#scrollTo=5SGb0kVC3JGW
代码
def paths_and_labels_to_dataset(audio_paths, labels):
    """Construct a dataset of (audio, label) pairs.

    Args:
        audio_paths: Sequence of audio file paths (strings). May be empty.
        labels: Sequence of labels, expected to line up with `audio_paths`.

    Returns:
        A `tf.data.Dataset` that zips the audio decoded by `path_to_audio`
        with the corresponding labels.
    """
    # Force a string dtype. An empty Python list would otherwise be inferred
    # as float32, and `tf.io.read_file` would then fail with
    # "Input 'filename' of 'ReadFile' Op has type float32 that does not match
    # expected type of string."
    path_tensor = tf.convert_to_tensor(audio_paths, dtype=tf.string)
    path_ds = tf.data.Dataset.from_tensor_slices(path_tensor)
    audio_ds = path_ds.map(path_to_audio)
    label_ds = tf.data.Dataset.from_tensor_slices(labels)
    return tf.data.Dataset.zip((audio_ds, label_ds))
def path_to_audio(path):
    """Load the audio file at `path` and return its decoded samples.

    The file contents are decoded with `tf.audio.decode_wav`, requesting a
    single channel and `SAMPLING_RATE` samples. The sample rate reported by
    the decoder is discarded.
    """
    raw_bytes = tf.io.read_file(path)
    samples, _sample_rate = tf.audio.decode_wav(
        raw_bytes, desired_channels=1, desired_samples=SAMPLING_RATE
    )
    return samples
def add_noise(audio, noises=None, scale=0.5):
    """Mix randomly selected noise into `audio`.

    Args:
        audio: Batch of audio tensors; indexed along axis 0 (one noise pick
            per entry) and axis 1 (reduced to find the peak value).
        noises: Optional stack of noise samples indexed along axis 0. When
            `None`, `audio` is returned untouched.
        scale: Multiplier applied to the rescaled noise before it is added.

    Returns:
        `audio` with rescaled noise added, or `audio` itself when `noises`
        is `None`.
    """
    if noises is None:
        return audio

    # Draw one random index into `noises` for every entry along the first
    # axis of `audio`.
    picks = tf.random.uniform(
        (tf.shape(audio)[0],), 0, noises.shape[0], dtype=tf.int32
    )
    picked_noise = tf.gather(noises, picks, axis=0)

    # Ratio between the peak of the audio and the peak of the chosen noise,
    # repeated along axis 1 so it lines up with `audio`.
    peak_ratio = tf.math.reduce_max(audio, axis=1) / tf.math.reduce_max(
        picked_noise, axis=1
    )
    peak_ratio = tf.expand_dims(peak_ratio, axis=1)
    peak_ratio = tf.repeat(peak_ratio, tf.shape(audio)[1], axis=1)

    # Add the rescaled noise to the audio.
    return audio + picked_noise * peak_ratio * scale
def audio_to_fft(audio):
    """Return the magnitude of the first half of the FFT of `audio`.

    `tf.signal.fft` works on the innermost dimension, so the trailing axis
    of `audio` is squeezed away before the transform and added back after.
    """
    waveform = tf.squeeze(audio, axis=-1)
    as_complex = tf.cast(
        tf.complex(real=waveform, imag=tf.zeros_like(waveform)), tf.complex64
    )
    spectrum = tf.expand_dims(tf.signal.fft(as_complex), axis=-1)

    # Keep only the first half of the FFT, which represents the positive
    # frequencies, and return its absolute value.
    half = waveform.shape[1] // 2
    return tf.math.abs(spectrum[:, :half, :])
# Get the list of audio file paths along with their corresponding labels.
# Every entry of DATASET_AUDIO_PATH is treated as one speaker (class) and
# the label is the entry's position in the directory listing.
class_names = os.listdir(DATASET_AUDIO_PATH)
print("Our class names: {}".format(class_names,))

audio_paths = []
labels = []
for label, name in enumerate(class_names):
    print("Processing speaker {}".format(name,))
    dir_path = Path(DATASET_AUDIO_PATH) / name
    speaker_sample_paths = [
        os.path.join(dir_path, filepath)
        for filepath in os.listdir(dir_path)
        if filepath.endswith(".wav")
    ]
    audio_paths += speaker_sample_paths
    labels += [label] * len(speaker_sample_paths)

print(
    "Found {} files belonging to {} classes.".format(len(audio_paths), len(class_names))
)

# Fail fast with a clear message. An empty path list is otherwise turned
# into a float32 dataset and only fails later, inside tf.io.read_file, with
# a confusing "ReadFile ... has type float32" TypeError.
if not audio_paths:
    raise ValueError(
        "No .wav files found under {!r}; expected one sub-directory per "
        "speaker containing .wav files.".format(str(DATASET_AUDIO_PATH))
    )
# Shuffle paths and labels with two identically seeded generators so both
# lists receive the same permutation and each path keeps its label.
rng = np.random.RandomState(SHUFFLE_SEED)
rng.shuffle(audio_paths)
rng = np.random.RandomState(SHUFFLE_SEED)
rng.shuffle(labels)

# Split into training and validation.
# Use an explicit split index rather than negative slicing: with
# num_val_samples == 0, `audio_paths[:-0]` is empty and `audio_paths[-0:]`
# is the whole list, which would leave the training set empty.
num_val_samples = int(VALID_SPLIT * len(audio_paths))
split_index = max(len(audio_paths) - num_val_samples, 0)

print("Using {} files for training.".format(len(audio_paths) - num_val_samples))
train_audio_paths = audio_paths[:split_index]
train_labels = labels[:split_index]

print("Using {} files for validation.".format(num_val_samples))
valid_audio_paths = audio_paths[split_index:]
valid_labels = labels[split_index:]
# Build the training pipeline: shuffle, batch, mix in noise, convert the
# waveforms to the frequency domain with `audio_to_fft`, then prefetch.
train_ds = (
    paths_and_labels_to_dataset(train_audio_paths, train_labels)
    .shuffle(buffer_size=BATCH_SIZE * 8, seed=SHUFFLE_SEED)
    .batch(BATCH_SIZE)
    .map(
        lambda x, y: (add_noise(x, noises, scale=SCALE), y),
        num_parallel_calls=tf.data.AUTOTUNE,
    )
    .map(lambda x, y: (audio_to_fft(x), y), num_parallel_calls=tf.data.AUTOTUNE)
    .prefetch(tf.data.AUTOTUNE)
)

# Build the validation pipeline: same steps, but with a fixed batch size of
# 32 and without adding noise.
valid_ds = (
    paths_and_labels_to_dataset(valid_audio_paths, valid_labels)
    .shuffle(buffer_size=32 * 8, seed=SHUFFLE_SEED)
    .batch(32)
    .map(lambda x, y: (audio_to_fft(x), y), num_parallel_calls=tf.data.AUTOTUNE)
    .prefetch(tf.data.AUTOTUNE)
)
目前没有回答
相关问题 更多 >
编程相关推荐