朴素贝叶斯算法是一种基于贝叶斯定理的简单但非常有效的分类算法。它假设在给定类别的条件下,各个特征之间相互独立,这就是“朴素”这个词的由来。下面是一个用Java实现朴素贝叶斯分类器的示例。
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
/**
 * A multinomial naive Bayes classifier over string-valued features with add-one (Laplace)
 * smoothing.
 *
 * <p>Each sample is treated as a bag of feature tokens: the position of a token in the array is
 * ignored, and a token that occurs several times in one sample is counted once per occurrence.
 *
 * <p>Instances hold mutable training state in plain {@link HashMap}s and perform no
 * synchronization.
 */
public class NaiveBayesClassifier {
    /** label -> (feature token -> occurrences of that token in samples carrying the label). */
    private final Map<String, Map<String, Integer>> featureCountPerLabel;
    /** label -> total number of feature tokens seen in samples carrying the label. */
    private final Map<String, Integer> featureTotalPerLabel;
    /** label -> number of training samples carrying the label. */
    private final Map<String, Integer> labelCount;
    /** Every distinct feature token seen during training, across all labels. */
    private final Set<String> vocabulary;
    /** Number of samples passed to {@link #train}. */
    private int totalSamples;

    /** Creates an untrained classifier. */
    public NaiveBayesClassifier() {
        featureCountPerLabel = new HashMap<>();
        featureTotalPerLabel = new HashMap<>();
        labelCount = new HashMap<>();
        vocabulary = new HashSet<>();
        totalSamples = 0;
    }

    /**
     * Adds one labelled sample to the model.
     *
     * @param features the feature tokens of the sample; may be empty
     * @param label the class label of the sample
     * @throws NullPointerException if {@code features} or {@code label} is {@code null}
     */
    public void train(String[] features, String label) {
        // Validate before touching any counter so a bad call cannot leave the model half-updated.
        Objects.requireNonNull(features, "features");
        Objects.requireNonNull(label, "label");

        labelCount.merge(label, 1, Integer::sum);
        totalSamples++;

        Map<String, Integer> counts =
                featureCountPerLabel.computeIfAbsent(label, key -> new HashMap<>());
        for (String feature : features) {
            counts.merge(feature, 1, Integer::sum);
            vocabulary.add(feature);
        }
        featureTotalPerLabel.merge(label, features.length, Integer::sum);
    }

    /**
     * Returns the label with the highest log-posterior score for the given features.
     *
     * <p>The score of a label is {@code log P(label)} plus, for every feature token,
     * {@code log((count(token, label) + 1) / (tokens(label) + |V|))}, where {@code tokens(label)}
     * is the total number of tokens seen with that label and {@code |V|} is the number of distinct
     * tokens seen in training. Tokens never seen in training are scored with a count of zero.
     *
     * @param features the feature tokens of the sample to classify
     * @return the best-scoring label, or {@code null} if {@link #train} has never been called
     * @throws NullPointerException if {@code features} is {@code null}
     */
    public String predict(String[] features) {
        Objects.requireNonNull(features, "features");

        String bestLabel = null;
        double bestLogProbability = Double.NEGATIVE_INFINITY;
        int vocabularySize = vocabulary.size();

        for (Map.Entry<String, Integer> entry : labelCount.entrySet()) {
            String label = entry.getKey();
            double logProbability = Math.log(entry.getValue() / (double) totalSamples);

            // train() always records a token total and a count map for every label it sees.
            double denominator = featureTotalPerLabel.get(label) + vocabularySize;
            // The denominator is zero only when no token was ever seen for any label; the
            // likelihood term is then the same for every label, so only the prior is used.
            if (denominator > 0) {
                Map<String, Integer> counts = featureCountPerLabel.get(label);
                for (String feature : features) {
                    int featureCount = counts.getOrDefault(feature, 0);
                    logProbability += Math.log((featureCount + 1.0) / denominator);
                }
            }

            if (logProbability > bestLogProbability) {
                bestLogProbability = logProbability;
                bestLabel = label;
            }
        }
        return bestLabel;
    }
}
测试分类器
/** Trains a {@code NaiveBayesClassifier} on a small weather data set and prints one prediction. */
public class NaiveBayesExample {
    public static void main(String[] args) {
        // Training data set: samples[i] is the feature vector whose label is labels[i].
        String[][] samples = {
            {"sunny", "hot", "high", "false"},
            {"sunny", "hot", "high", "true"},
            {"overcast", "hot", "high", "false"},
            {"rainy", "mild", "high", "false"},
            {"rainy", "cool", "normal", "false"},
            {"rainy", "cool", "normal", "true"},
            {"overcast", "cool", "normal", "true"},
            {"sunny", "mild", "high", "false"},
            {"sunny", "cool", "normal", "false"},
            {"rainy", "mild", "normal", "false"},
            {"sunny", "mild", "normal", "true"},
            {"overcast", "mild", "high", "true"},
            {"overcast", "hot", "normal", "false"},
            {"rainy", "mild", "high", "true"},
        };
        String[] labels = {
            "no", "no", "yes", "yes", "yes", "no", "yes",
            "no", "yes", "yes", "yes", "yes", "yes", "no",
        };

        NaiveBayesClassifier model = new NaiveBayesClassifier();
        for (int i = 0; i < samples.length; i++) {
            model.train(samples[i], labels[i]);
        }

        // Prediction
        String[] query = {"sunny", "cool", "high", "true"};
        System.out.println("Predicted class: " + model.predict(query));
    }
}
运行结果
这段代码将根据训练数据预测给定样本的类别。
说明
`train` 方法用来训练分类器,记录每个类别下各个特征的出现次数。
`predict` 方法用来预测新样本的类别。它根据贝叶斯定理计算每个类别的概率,并选择概率最大的类别作为预测结果。
这个实现使用了简单的加法平滑(加1平滑)来处理某些特征在训练数据中未出现的情况。