我的神经网络预测了同样的值

2024-05-29 11:23:15 发布

您现在位置:Python中文网/ 问答频道 /正文

我试图建立1d cnn来对fakenews和它是哪种类型的新闻进行分类。它仍然预测相同的值,我认为这与真实的FLASE有关,因为准确率约为100%,这是不正常的。这是我的密码:

import pandas as pd
import numpy as np
from keras import layers
from keras import models
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras import optimizers
import pickle
import os
import keras
import tensorflow as tf

tf.compat.v1.disable_eager_execution()

real = pd.read_csv("C:\\Users\\Tadeas\\Downloads\\truenews\\True.csv")
fake = pd.read_csv("C:\\Users\\Tadeas\\Downloads\\fakenews\\Fake.csv")


text = []
subject_target = []
truefalse_target = []

rowstrue = 21417
rowsfalse = 23481

for i in range(rowstrue):
  subject_target.append(real["subject"][i])
  text.append(real["text"][i])
  truefalse_target.append("True")


for i in range(rowsfalse):
  subject_target.append(fake["subject"][i])
  text.append(fake["text"][i])
  truefalse_target.append("False")


num_subject = []
num_trueflase = []


for q in subject_target:
  if q == "politicsNews" or q == "politics" or q =="Government News":
    num_subject.append(1)

  else:
    num_subject.append(0)



for i in truefalse_target:
  if i == "True":
    num_trueflase.append(1)

  else:
    num_trueflase.append(0)



y_subject = np.asarray(num_subject)
y_truefalse = np.asarray(num_trueflase)


tokenizer = Tokenizer(num_words= len(text))
tokenizer.fit_on_texts(text)
seq = tokenizer.texts_to_sequences(text)
data = pad_sequences(seq, maxlen= 164)

split = 20000


realsamples = np.arange(data.shape[0])
np.random.shuffle(realsamples)
data = data[realsamples]
y_subject = y_subject[realsamples]
y_truefalse = y_truefalse[realsamples]


x_train = data[:split]
x_val = data[split:]
y_subject2 = y_subject[:split]
y_subject_val = y_subject[split:]
y_truefalse2 = y_truefalse[:split]
y_truefalse_val = y_truefalse[split:]



first = x_train.shape[0]

glove_dir = "C:\\Users\\Tadeas\\Downloads\\glove.6b"

embeddings_ind = {}
f = open(os.path.join(glove_dir, "glove.6B.100d.txt"), encoding="utf8")
for l in f:
  values = l.split()
  word = values[0]
  coefs = np.asarray(values[1:], dtype="float32")
  embeddings_ind[word] = coefs
f.close()

embeding_matrix = np.zeros((data.shape[0], 100))

for word, ind in tokenizer.word_index.items():
  if ind < data.shape[0]:
    embeding_vector = embeddings_ind.get(word)
    if embeding_vector is not None:
      embeding_matrix[ind] = embeding_vector



datas = keras.Input(shape=(x_train.shape[1:]))
embedded = layers.Embedding(data.shape[0], 100, input_length= 164, weights= [embeding_matrix], trainable = False)(datas)
x = layers.Conv1D(128, 5, activation="relu", padding="VALID")(embedded)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(256, 5, activation="relu")(x)
x = layers.MaxPooling1D(3)(x)
x = layers.Conv1D(256, 5, activation="relu")(x)
x = layers.Conv1D(256, 5, activation="relu")(x)
x = layers.GlobalMaxPooling1D()(x)



y_subject_prediction = layers.Dense(1, activation="sigmoid", name="subject")(x)
y_truefalse_prediction = layers.Dense(1, activation="sigmoid", name="truefalse")(x)

m = models.Model(datas, [y_subject_prediction, y_truefalse_prediction])


m.compile(optimizer=optimizers.Adam(lr = 0.001), loss={"subject": "binary_crossentropy", "truefalse": "binary_crossentropy"}, metrics=["acc"])

m.fit(x_train, {"subject": y_subject2, "truefalse": y_truefalse2}, epochs=50, batch_size=128, validation_data= (x_val, [y_subject_val, y_truefalse_val]))

m.save("fakenews5.h5")
with open('tokenizer_fakenews5.pickle', 'wb') as handle:
  pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

这是我的预测代码:

from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pickle

m = load_model("fakenews5.h5")

with open('tokenizer_fakenews5.pickle', 'rb') as handle:
  loaded_tokenizer = pickle.load(handle)

k = input("Enter the news: ")


seq = loaded_tokenizer.texts_to_sequences(k)
data = pad_sequences(seq, maxlen= 164)



supafinal, lol = m.predict(data)

print(supafinal)
print(lol)

这是我的输出:

[[0.5808463]
 [0.5808463]
 [0.5808463]
 [0.5808463]
 [0.5808463]
 [0.5808463]
 [0.5808463]
 [0.5808463]
 [0.5808463]
 [0.5808463]
 [0.5808463]
 [0.5808463]
 [0.5808463]
 [0.5808463]
 [0.5808463]]
[[0.00044626]
 [0.00044626]
 [0.00044626]
 [0.00044626]
 [0.00044626]
 [0.00044626]
 [0.00044626]
 [0.00044626]
 [0.00044626]
 [0.00044626]
 [0.00044626]
 [0.00044626]
 [0.00044623]
 [0.00044623]
 [0.00044623]]

这是我的训练准确性,因此如果有帮助:

Epoch 5/30
20000/20000 [==============================] - 13s 648us/sample - loss: 0.0810 - subject_loss: 0.2754 - truefalse_loss: 0.0120 - subject_acc: 0.8636 - truefalse_acc: 0.9954 - val_loss: 0.2802 - val_subject_loss: 0.3926 - val_truefalse_loss: 0.1825 - val_subject_acc: 0.8155 - val_truefalse_acc: 0.9591
Epoch 6/30
20000/20000 [==============================] - 13s 650us/sample - loss: 0.0783 - subject_loss: 0.2574 - truefalse_loss: 0.0139 - subject_acc: 0.8770 - truefalse_acc: 0.9950 - val_loss: 0.2297 - val_subject_loss: 0.3663 - val_truefalse_loss: 0.1379 - val_subject_acc: 0.8152 - val_truefalse_acc: 0.9688
Epoch 7/30
20000/20000 [==============================] - 13s 649us/sample - loss: 0.0739 - subject_loss: 0.2396 - truefalse_loss: 0.0140 - subject_acc: 0.8867 - truefalse_acc: 0.9950 - val_loss: 0.2562 - val_subject_loss: 0.4262 - val_truefalse_loss: 0.1495 - val_subject_acc: 0.8089 - val_truefalse_acc: 0.9700
Epoch 8/30
20000/20000 [==============================] - 13s 649us/sample - loss: 0.0666 - subject_loss: 0.2184 - truefalse_loss: 0.0122 - subject_acc: 0.9038 - truefalse_acc: 0.9954 - val_loss: 0.3527 - val_subject_loss: 0.5107 - val_truefalse_loss: 0.2254 - val_subject_acc: 0.8010 - val_truefalse_acc: 0.9557
Epoch 9/30
20000/20000 [==============================] - 13s 667us/sample - loss: 0.0692 - subject_loss: 0.2083 - truefalse_loss: 0.0173 - subject_acc: 0.9069 - truefalse_acc: 0.9942 - val_loss: 0.3166 - val_subject_loss: 0.4670 - val_truefalse_loss: 0.1999 - val_subject_acc: 0.7927 - val_truefalse_acc: 0.9561
Epoch 10/30
20000/20000 [==============================] - 13s 661us/sample - loss: 0.0497 - subject_loss: 0.1806 - truefalse_loss: 0.0045 - subject_acc: 0.9205 - truefalse_acc: 0.9981 - val_loss: 0.3170 - val_subject_loss: 0.4885 - val_truefalse_loss: 0.1942 - val_subject_acc: 0.8050 - val_truefalse_acc: 0.9659

Tags: textfromimportdatalayersnpvalnum

热门问题