Python机器学习朴素贝叶斯分类

2024-05-16 01:01:53 发布

您现在位置:Python中文网/ 问答频道 /正文

我有一个很小的数据集,我想用python测试NaiveBayes分类,我尝试了很多不同的方法,但代码无法运行,也没有得到预期的输出,所以最后我想有人会解决我的问题

我有巨大的数据集样本数据集是附加测试我想测试朴素贝叶斯分类在python机器学习

我的数据集链接http://www.wikisend.com/download/663276/


Tags: 数据方法代码com机器http链接download
1条回答
网友
1楼 · 发布于 2024-05-16 01:01:53

这是一个功能完整的代码,来自Jason Brownlee,机器学习大师,我通过删除csv文件并将数据作为矩阵嵌入到文件中,来适应您的需要

我买了他的几本书,并在我的日常工作中实现了他的一些时间序列预测技术

以下代码生成以下输出,您可以按如下方式构造流程:

/usr/local/bin/python3.8 /Users/fisheyjay/PycharmProjects/naive_bayes_iris/naive_bayes_iris.py Scores: [93.33333333333333, 96.66666666666667, 100.0, 93.33333333333333, 93.33333333333333] Mean Accuracy: 95.333%

Process finished with exit code 0

代码如下:

# Naive Bayes On The Iris iris_data_set
from random import seed
from random import randrange
from math import sqrt
from math import exp
from math import pi


# Convert string column to float
def str_column_to_float(iris_data_set, column):
    for row in iris_data_set:
        row[column] = float(row[column].strip())


# Convert string column to integer
def str_column_to_int(iris_data_set, column):
    class_values = [row[column] for row in iris_data_set]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in iris_data_set:
        row[column] = lookup[row[column]]
    return lookup


# Split a iris_data_set into k folds
def cross_validation_split(iris_data_set, n_folds):
    iris_data_set_split = list()
    iris_data_set_copy = list(iris_data_set)
    fold_size = int(len(iris_data_set) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(iris_data_set_copy))
            fold.append(iris_data_set_copy.pop(index))
        iris_data_set_split.append(fold)
    return iris_data_set_split


# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0


# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(iris_data_set, algorithm, n_folds, *args):
    folds = cross_validation_split(iris_data_set, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores


# Split the iris_data_set by class values, returns a dictionary
def separate_by_class(iris_data_set):
    separated = dict()
    for i in range(len(iris_data_set)):
        vector = iris_data_set[i]
        class_value = vector[-1]
        if (class_value not in separated):
            separated[class_value] = list()
        separated[class_value].append(vector)
    return separated


# Calculate the mean of a list of numbers
def mean(numbers):
    return sum(numbers) / float(len(numbers))


# Calculate the standard deviation of a list of numbers
def standard_deviation(numbers):
    avg = mean(numbers)
    variance = sum([(x - avg) ** 2 for x in numbers]) / float(len(numbers) - 1)
    return sqrt(variance)


# Calculate the mean, standard_deviation and count for each column in a iris_data_set
def summarize_iris_data_set(iris_data_set):
    summaries = [(mean(column), standard_deviation(column), len(column)) for column in zip(*iris_data_set)]
    del (summaries[-1])
    return summaries


# Split iris_data_set by class then calculate statistics for each row
def summarize_by_class(iris_data_set):
    separated = separate_by_class(iris_data_set)
    summaries = dict()
    for class_value, rows in separated.items():
        summaries[class_value] = summarize_iris_data_set(rows)
    return summaries


# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, standard_deviation):
    exponent = exp(-((x - mean) ** 2 / (2 * standard_deviation ** 2)))
    return (1 / (sqrt(2 * pi) * standard_deviation)) * exponent


# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
    total_rows = sum([summaries[label][0][2] for label in summaries])
    probabilities = dict()
    for class_value, class_summaries in summaries.items():
        probabilities[class_value] = summaries[class_value][0][2] / float(total_rows)
        for i in range(len(class_summaries)):
            mean, standard_deviation, _ = class_summaries[i]
            probabilities[class_value] *= calculate_probability(row[i], mean, standard_deviation)
    return probabilities


# Predict the class for a given row
def predict(summaries, row):
    probabilities = calculate_class_probabilities(summaries, row)
    best_label, best_prob = None, -1
    for class_value, probability in probabilities.items():
        if best_label is None or probability > best_prob:
            best_prob = probability
            best_label = class_value
    return best_label


# Naive Bayes Algorithm
def naive_bayes(train, test):
    summarize = summarize_by_class(train)
    predictions = list()
    for row in test:
        output = predict(summarize, row)
        predictions.append(output)
    return (predictions)


# Test Naive Bayes on Iris iris_data_set
seed(1)

iris_data_set = [['5.1', '3.5', '1.4', '0.2', 'Iris-setosa'], ['4.9', '3.0', '1.4', '0.2', 'Iris-setosa'],
     ['4.7', '3.2', '1.3', '0.2', 'Iris-setosa'], ['4.6', '3.1', '1.5', '0.2', 'Iris-setosa'],
     ['5.0', '3.6', '1.4', '0.2', 'Iris-setosa'], ['5.4', '3.9', '1.7', '0.4', 'Iris-setosa'],
     ['4.6', '3.4', '1.4', '0.3', 'Iris-setosa'], ['5.0', '3.4', '1.5', '0.2', 'Iris-setosa'],
     ['4.4', '2.9', '1.4', '0.2', 'Iris-setosa'], ['4.9', '3.1', '1.5', '0.1', 'Iris-setosa'],
     ['5.4', '3.7', '1.5', '0.2', 'Iris-setosa'], ['4.8', '3.4', '1.6', '0.2', 'Iris-setosa'],
     ['4.8', '3.0', '1.4', '0.1', 'Iris-setosa'], ['4.3', '3.0', '1.1', '0.1', 'Iris-setosa'],
     ['5.8', '4.0', '1.2', '0.2', 'Iris-setosa'], ['5.7', '4.4', '1.5', '0.4', 'Iris-setosa'],
     ['5.4', '3.9', '1.3', '0.4', 'Iris-setosa'], ['5.1', '3.5', '1.4', '0.3', 'Iris-setosa'],
     ['5.7', '3.8', '1.7', '0.3', 'Iris-setosa'], ['5.1', '3.8', '1.5', '0.3', 'Iris-setosa'],
     ['5.4', '3.4', '1.7', '0.2', 'Iris-setosa'], ['5.1', '3.7', '1.5', '0.4', 'Iris-setosa'],
     ['4.6', '3.6', '1.0', '0.2', 'Iris-setosa'], ['5.1', '3.3', '1.7', '0.5', 'Iris-setosa'],
     ['4.8', '3.4', '1.9', '0.2', 'Iris-setosa'], ['5.0', '3.0', '1.6', '0.2', 'Iris-setosa'],
     ['5.0', '3.4', '1.6', '0.4', 'Iris-setosa'], ['5.2', '3.5', '1.5', '0.2', 'Iris-setosa'],
     ['5.2', '3.4', '1.4', '0.2', 'Iris-setosa'], ['4.7', '3.2', '1.6', '0.2', 'Iris-setosa'],
     ['4.8', '3.1', '1.6', '0.2', 'Iris-setosa'], ['5.4', '3.4', '1.5', '0.4', 'Iris-setosa'],
     ['5.2', '4.1', '1.5', '0.1', 'Iris-setosa'], ['5.5', '4.2', '1.4', '0.2', 'Iris-setosa'],
     ['4.9', '3.1', '1.5', '0.1', 'Iris-setosa'], ['5.0', '3.2', '1.2', '0.2', 'Iris-setosa'],
     ['5.5', '3.5', '1.3', '0.2', 'Iris-setosa'], ['4.9', '3.1', '1.5', '0.1', 'Iris-setosa'],
     ['4.4', '3.0', '1.3', '0.2', 'Iris-setosa'], ['5.1', '3.4', '1.5', '0.2', 'Iris-setosa'],
     ['5.0', '3.5', '1.3', '0.3', 'Iris-setosa'], ['4.5', '2.3', '1.3', '0.3', 'Iris-setosa'],
     ['4.4', '3.2', '1.3', '0.2', 'Iris-setosa'], ['5.0', '3.5', '1.6', '0.6', 'Iris-setosa'],
     ['5.1', '3.8', '1.9', '0.4', 'Iris-setosa'], ['4.8', '3.0', '1.4', '0.3', 'Iris-setosa'],
     ['5.1', '3.8', '1.6', '0.2', 'Iris-setosa'], ['4.6', '3.2', '1.4', '0.2', 'Iris-setosa'],
     ['5.3', '3.7', '1.5', '0.2', 'Iris-setosa'], ['5.0', '3.3', '1.4', '0.2', 'Iris-setosa'],
     ['7.0', '3.2', '4.7', '1.4', 'Iris-versicolor'], ['6.4', '3.2', '4.5', '1.5', 'Iris-versicolor'],
     ['6.9', '3.1', '4.9', '1.5', 'Iris-versicolor'], ['5.5', '2.3', '4.0', '1.3', 'Iris-versicolor'],
     ['6.5', '2.8', '4.6', '1.5', 'Iris-versicolor'], ['5.7', '2.8', '4.5', '1.3', 'Iris-versicolor'],
     ['6.3', '3.3', '4.7', '1.6', 'Iris-versicolor'], ['4.9', '2.4', '3.3', '1.0', 'Iris-versicolor'],
     ['6.6', '2.9', '4.6', '1.3', 'Iris-versicolor'], ['5.2', '2.7', '3.9', '1.4', 'Iris-versicolor'],
     ['5.0', '2.0', '3.5', '1.0', 'Iris-versicolor'], ['5.9', '3.0', '4.2', '1.5', 'Iris-versicolor'],
     ['6.0', '2.2', '4.0', '1.0', 'Iris-versicolor'], ['6.1', '2.9', '4.7', '1.4', 'Iris-versicolor'],
     ['5.6', '2.9', '3.6', '1.3', 'Iris-versicolor'], ['6.7', '3.1', '4.4', '1.4', 'Iris-versicolor'],
     ['5.6', '3.0', '4.5', '1.5', 'Iris-versicolor'], ['5.8', '2.7', '4.1', '1.0', 'Iris-versicolor'],
     ['6.2', '2.2', '4.5', '1.5', 'Iris-versicolor'], ['5.6', '2.5', '3.9', '1.1', 'Iris-versicolor'],
     ['5.9', '3.2', '4.8', '1.8', 'Iris-versicolor'], ['6.1', '2.8', '4.0', '1.3', 'Iris-versicolor'],
     ['6.3', '2.5', '4.9', '1.5', 'Iris-versicolor'], ['6.1', '2.8', '4.7', '1.2', 'Iris-versicolor'],
     ['6.4', '2.9', '4.3', '1.3', 'Iris-versicolor'], ['6.6', '3.0', '4.4', '1.4', 'Iris-versicolor'],
     ['6.8', '2.8', '4.8', '1.4', 'Iris-versicolor'], ['6.7', '3.0', '5.0', '1.7', 'Iris-versicolor'],
     ['6.0', '2.9', '4.5', '1.5', 'Iris-versicolor'], ['5.7', '2.6', '3.5', '1.0', 'Iris-versicolor'],
     ['5.5', '2.4', '3.8', '1.1', 'Iris-versicolor'], ['5.5', '2.4', '3.7', '1.0', 'Iris-versicolor'],
     ['5.8', '2.7', '3.9', '1.2', 'Iris-versicolor'], ['6.0', '2.7', '5.1', '1.6', 'Iris-versicolor'],
     ['5.4', '3.0', '4.5', '1.5', 'Iris-versicolor'], ['6.0', '3.4', '4.5', '1.6', 'Iris-versicolor'],
     ['6.7', '3.1', '4.7', '1.5', 'Iris-versicolor'], ['6.3', '2.3', '4.4', '1.3', 'Iris-versicolor'],
     ['5.6', '3.0', '4.1', '1.3', 'Iris-versicolor'], ['5.5', '2.5', '4.0', '1.3', 'Iris-versicolor'],
     ['5.5', '2.6', '4.4', '1.2', 'Iris-versicolor'], ['6.1', '3.0', '4.6', '1.4', 'Iris-versicolor'],
     ['5.8', '2.6', '4.0', '1.2', 'Iris-versicolor'], ['5.0', '2.3', '3.3', '1.0', 'Iris-versicolor'],
     ['5.6', '2.7', '4.2', '1.3', 'Iris-versicolor'], ['5.7', '3.0', '4.2', '1.2', 'Iris-versicolor'],
     ['5.7', '2.9', '4.2', '1.3', 'Iris-versicolor'], ['6.2', '2.9', '4.3', '1.3', 'Iris-versicolor'],
     ['5.1', '2.5', '3.0', '1.1', 'Iris-versicolor'], ['5.7', '2.8', '4.1', '1.3', 'Iris-versicolor'],
     ['6.3', '3.3', '6.0', '2.5', 'Iris-virginica'], ['5.8', '2.7', '5.1', '1.9', 'Iris-virginica'],
     ['7.1', '3.0', '5.9', '2.1', 'Iris-virginica'], ['6.3', '2.9', '5.6', '1.8', 'Iris-virginica'],
     ['6.5', '3.0', '5.8', '2.2', 'Iris-virginica'], ['7.6', '3.0', '6.6', '2.1', 'Iris-virginica'],
     ['4.9', '2.5', '4.5', '1.7', 'Iris-virginica'], ['7.3', '2.9', '6.3', '1.8', 'Iris-virginica'],
     ['6.7', '2.5', '5.8', '1.8', 'Iris-virginica'], ['7.2', '3.6', '6.1', '2.5', 'Iris-virginica'],
     ['6.5', '3.2', '5.1', '2.0', 'Iris-virginica'], ['6.4', '2.7', '5.3', '1.9', 'Iris-virginica'],
     ['6.8', '3.0', '5.5', '2.1', 'Iris-virginica'], ['5.7', '2.5', '5.0', '2.0', 'Iris-virginica'],
     ['5.8', '2.8', '5.1', '2.4', 'Iris-virginica'], ['6.4', '3.2', '5.3', '2.3', 'Iris-virginica'],
     ['6.5', '3.0', '5.5', '1.8', 'Iris-virginica'], ['7.7', '3.8', '6.7', '2.2', 'Iris-virginica'],
     ['7.7', '2.6', '6.9', '2.3', 'Iris-virginica'], ['6.0', '2.2', '5.0', '1.5', 'Iris-virginica'],
     ['6.9', '3.2', '5.7', '2.3', 'Iris-virginica'], ['5.6', '2.8', '4.9', '2.0', 'Iris-virginica'],
     ['7.7', '2.8', '6.7', '2.0', 'Iris-virginica'], ['6.3', '2.7', '4.9', '1.8', 'Iris-virginica'],
     ['6.7', '3.3', '5.7', '2.1', 'Iris-virginica'], ['7.2', '3.2', '6.0', '1.8', 'Iris-virginica'],
     ['6.2', '2.8', '4.8', '1.8', 'Iris-virginica'], ['6.1', '3.0', '4.9', '1.8', 'Iris-virginica'],
     ['6.4', '2.8', '5.6', '2.1', 'Iris-virginica'], ['7.2', '3.0', '5.8', '1.6', 'Iris-virginica'],
     ['7.4', '2.8', '6.1', '1.9', 'Iris-virginica'], ['7.9', '3.8', '6.4', '2.0', 'Iris-virginica'],
     ['6.4', '2.8', '5.6', '2.2', 'Iris-virginica'], ['6.3', '2.8', '5.1', '1.5', 'Iris-virginica'],
     ['6.1', '2.6', '5.6', '1.4', 'Iris-virginica'], ['7.7', '3.0', '6.1', '2.3', 'Iris-virginica'],
     ['6.3', '3.4', '5.6', '2.4', 'Iris-virginica'], ['6.4', '3.1', '5.5', '1.8', 'Iris-virginica'],
     ['6.0', '3.0', '4.8', '1.8', 'Iris-virginica'], ['6.9', '3.1', '5.4', '2.1', 'Iris-virginica'],
     ['6.7', '3.1', '5.6', '2.4', 'Iris-virginica'], ['6.9', '3.1', '5.1', '2.3', 'Iris-virginica'],
     ['5.8', '2.7', '5.1', '1.9', 'Iris-virginica'], ['6.8', '3.2', '5.9', '2.3', 'Iris-virginica'],
     ['6.7', '3.3', '5.7', '2.5', 'Iris-virginica'], ['6.7', '3.0', '5.2', '2.3', 'Iris-virginica'],
     ['6.3', '2.5', '5.0', '1.9', 'Iris-virginica'], ['6.5', '3.0', '5.2', '2.0', 'Iris-virginica'],
     ['6.2', '3.4', '5.4', '2.3', 'Iris-virginica'], ['5.9', '3.0', '5.1', '1.8', 'Iris-virginica']]

for i in range(len(iris_data_set[0]) - 1):
    str_column_to_float(iris_data_set, i)
# convert class column to integers
str_column_to_int(iris_data_set, len(iris_data_set[0]) - 1)
# evaluate algorithm
n_folds = 5
scores = evaluate_algorithm(iris_data_set, naive_bayes, n_folds)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores) / float(len(scores))))

相关问题 更多 >