[ML] Introduction to Machine Learning with R, Part 2
Classification
-
Decision trees
rpart
library(rpart)
library(rpart.plot)

set.seed(42)
fit <- rpart(classes ~ .,
             data = train_data,
             method = "class",
             control = rpart.control(xval = 10,
                                     minbucket = 2,
                                     cp = 0),
             parms = list(split = "information"))

rpart.plot(fit, extra = 100)

-
Random forest
Random forest predictions are based on growing multiple classification trees. They can be used for classification and regression tasks. Here, I show a classification task.
library(caret)

set.seed(42)
model_rf <- caret::train(classes ~ .,
                         data = train_data,
                         method = "rf",
                         preProcess = c("scale", "center"),
                         trControl = trainControl(method = "repeatedcv",
                                                  number = 5,
                                                  repeats = 3,
                                                  savePredictions = TRUE,
                                                  verboseIter = FALSE))
When you specify savePredictions = TRUE, you can access the cross-validation results with model_rf$pred.
model_rf
model_rf$finalModel$confusion
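Because savePredictions = TRUE was set above, the held-out predictions collected during cross-validation are stored in the train object; a minimal sketch of how to peek at them (assuming model_rf was fit as above):
# first few held-out cross-validation predictions
head(model_rf$pred)
# per-resample performance estimates are stored as well
head(model_rf$resample)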
Dealing with unbalanced data
Luckily, caret makes it very easy to combine over- and under-sampling techniques with cross-validation resampling. We can simply add the sampling option to our trainControl and choose down-sampling; the rest stays the same as in our original model.
set.seed(42)
model_rf_down <- caret::train(classes ~ .,
                              data = train_data,
                              method = "rf",
                              preProcess = c("scale", "center"),
                              trControl = trainControl(method = "repeatedcv",
                                                       number = 10,
                                                       repeats = 10,
                                                       savePredictions = TRUE,
                                                       verboseIter = FALSE,
                                                       sampling = "down"))
model_rf_down
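To see what down-sampling buys us, we can compare both models on the held-out test set, just as we do for the original model further below. (The sampling argument also accepts "up" and, with the respective packages installed, "smote" or "rose".) A minimal sketch, assuming the train/test split from before:
# test-set performance of the original vs. the down-sampled random forest
confusionMatrix(predict(model_rf, test_data), as.factor(test_data$classes))
confusionMatrix(predict(model_rf_down, test_data), as.factor(test_data$classes))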
Feature importance
imp <- model_rf$finalModel$importance
imp[order(imp, decreasing = TRUE), ]
importance <- varImp(model_rf, scale = TRUE)
plot(importance)

Predicting the test data
confusionMatrix(predict(model_rf, test_data), as.factor(test_data$classes))
library(ggplot2)

results <- data.frame(actual = test_data$classes,
                      predict(model_rf, test_data, type = "prob"))

results$prediction <- ifelse(results$benign > 0.5, "benign",
                             ifelse(results$malignant > 0.5, "malignant", NA))

results$correct <- ifelse(results$actual == results$prediction, TRUE, FALSE)

ggplot(results, aes(x = prediction, fill = correct)) +
  geom_bar(position = "dodge")

ggplot(results, aes(x = prediction, y = benign, color = correct, shape = correct)) +
  geom_jitter(size = 3, alpha = 0.6)

Extreme gradient boosting trees
Extreme gradient boosting (XGBoost) is a faster and improved implementation of gradient boosting for supervised learning.
"XGBoost uses a more regularized model formalization to control over-fitting, which gives it better performance," according to Tianqi Chen, the developer of xgboost.
XGBoost is a tree ensemble model, which means that its prediction is the sum of the predictions of a set of classification and regression trees (CART). In that, XGBoost is similar to random forests, but it uses a different approach to model training. It can be used for classification and regression tasks. Here, I show a classification task.
set.seed(42)
model_xgb <- caret::train(classes ~ .,
                          data = train_data,
                          method = "xgbTree",
                          preProcess = c("scale", "center"),
                          trControl = trainControl(method = "repeatedcv",
                                                   number = 5,
                                                   repeats = 3,
                                                   savePredictions = TRUE,
                                                   verboseIter = FALSE))
model_xgb
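Unlike "rf", the "xgbTree" method tunes several hyperparameters (nrounds, max_depth, eta, and others) over a default grid during resampling. A minimal sketch of how to read off the winning combination from the train object:
# hyperparameter combination that caret selected as best
model_xgb$bestTune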
Feature importance
importance <- varImp(model_xgb, scale = TRUE)
plot(importance)

Predicting the test data
confusionMatrix(predict(model_xgb, test_data), as.factor(test_data$classes))
results <- data.frame(actual = test_data$classes,
                      predict(model_xgb, test_data, type = "prob"))

results$prediction <- ifelse(results$benign > 0.5, "benign",
                             ifelse(results$malignant > 0.5, "malignant", NA))

results$correct <- ifelse(results$actual == results$prediction, TRUE, FALSE)

ggplot(results, aes(x = prediction, fill = correct)) +
  geom_bar(position = "dodge")

ggplot(results, aes(x = prediction, y = benign, color = correct, shape = correct)) +
  geom_jitter(size = 3, alpha = 0.6)

Available models in caret
https://topepo.github.io/caret/available-models.html
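The page above lists every model string that can be passed to caret's method argument. The same information can also be queried from within R; a quick sketch using caret's own lookup helpers:
# which hyperparameters does caret tune for a given method?
modelLookup("rf")
modelLookup("xgbTree")
# all available method strings
head(names(getModelInfo()))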
Feature selection
Performing feature selection on the whole dataset would lead to prediction bias; we therefore need to run the whole modeling process on the training data alone!
-
Correlation
Correlations between all features are calculated and visualised with the corrplot package. I will then remove all features with a correlation higher than 0.7, keeping the feature with the lower mean.
library(corrplot)

# calculate correlation matrix
corMatMy <- cor(train_data[, -1])
corrplot(corMatMy, order = "hclust")

# apply correlation filter at 0.70
highlyCor <- colnames(train_data[, -1])[findCorrelation(corMatMy, cutoff = 0.7, verbose = TRUE)]
highlyCor
train_data_cor <- train_data[, which(!colnames(train_data) %in% highlyCor)]
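train_data_cor now contains only the weakly correlated features (plus the response). To judge whether the filter hurts performance, one option is to retrain the random forest on the reduced data and compare it with the full model; the same idea applies to the RFE and GA subsets below. A minimal sketch reusing the trainControl settings from above (model_rf_cor is a hypothetical name for illustration):
set.seed(42)
model_rf_cor <- caret::train(classes ~ .,
                             data = train_data_cor,   # correlation-filtered training data
                             method = "rf",
                             preProcess = c("scale", "center"),
                             trControl = trainControl(method = "repeatedcv",
                                                      number = 5,
                                                      repeats = 3,
                                                      savePredictions = TRUE,
                                                      verboseIter = FALSE))
model_rf_cor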
-
Recursive Feature Elimination (RFE)
Another way to choose features is Recursive Feature Elimination. RFE uses a random forest algorithm to test combinations of features and rate each combination with an accuracy score. The combination with the highest score is usually preferred.
set.seed(7)
results_rfe <- rfe(x = train_data[, -1],
                   y = as.factor(train_data$classes),
                   sizes = c(1:9),
                   rfeControl = rfeControl(functions = rfFuncs, method = "cv", number = 10))
predictors(results_rfe)
train_data_rfe <- train_data[, c(1, which(colnames(train_data) %in% predictors(results_rfe)))]
-
Genetic Algorithm (GA)
The Genetic Algorithm (GA) was developed based on the evolutionary principle of natural selection: it aims to optimize a population of individuals with a given set of genotypes by modeling selection over time. In each generation (i.e. iteration), each individual's fitness is calculated based on its genotype. The fittest individuals are then chosen to produce the next generation, whose individuals carry genotypes resulting from recombinations of the parental alleles. These new genotypes again determine each individual's fitness. This selection process is iterated for a specified number of generations and (ideally) leads to fixation of the fittest alleles in the gene pool.
This concept of optimization can also be applied to non-evolutionary models, such as feature selection in machine learning.
set.seed(27)
model_ga <- gafs(x = train_data[, -1],
                 y = as.factor(train_data$classes),
                 iters = 10,   # generations of the algorithm
                 popSize = 10, # population size in each generation
                 levels = c("malignant", "benign"),
                 gafsControl = gafsControl(functions = rfGA,        # assess fitness with RF
                                           method = "cv",           # 10-fold cross-validation
                                           genParallel = TRUE,      # parallelise within generations
                                           allowParallel = TRUE))
plot(model_ga)

train_data_ga <- train_data[, c(1, which(colnames(train_data) %in% model_ga$ga$final))]
Hyperparameter tuning with caret
-
Cartesian grid
-
mtry: the number of variables randomly sampled as candidates at each split.
set.seed(42)
grid <- expand.grid(mtry = c(1:10))

model_rf_tune_man <- caret::train(classes ~ .,
                                  data = train_data,
                                  method = "rf",
                                  preProcess = c("scale", "center"),
                                  trControl = trainControl(method = "repeatedcv",
                                                           number = 10,
                                                           repeats = 10,
                                                           savePredictions = TRUE,
                                                           verboseIter = FALSE),
                                  tuneGrid = grid)
model_rf_tune_man
plot(model_rf_tune_man)

-
Random search
set.seed(42)
model_rf_tune_auto <- caret::train(classes ~ .,
                                   data = train_data,
                                   method = "rf",
                                   preProcess = c("scale", "center"),
                                   trControl = trainControl(method = "repeatedcv",
                                                            number = 10,
                                                            repeats = 10,
                                                            savePredictions = TRUE,
                                                            verboseIter = FALSE,
                                                            search = "random"),
                                   # no tuneGrid here: with search = "random",
                                   # tuneLength sets how many mtry values are sampled
                                   tuneLength = 15)
model_rf_tune_auto
plot(model_rf_tune_auto)

Grid search with h2o
The R package h2o provides a convenient interface to H2O, an open-source machine learning and deep learning platform. H2O offers a broad range of common machine learning algorithms for classification, regression and deep learning.
library(h2o)
h2o.init(nthreads = -1)
h2o.no_progress()
bc_data_hf <- as.h2o(bc_data)
library(tidyr)  # gather()
library(dplyr)  # mutate() and %>%

h2o.describe(bc_data_hf) %>%
  gather(x, y, Zeros:Sigma) %>%
  mutate(group = ifelse(x %in% c("Min", "Max", "Mean"), "min, mean, max",
                        ifelse(x %in% c("NegInf", "PosInf"), "Inf", "sigma, zeros"))) %>%
  ggplot(aes(x = Label, y = as.numeric(y), color = x)) +
  geom_point(size = 4, alpha = 0.6) +
  scale_color_brewer(palette = "Set1") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  facet_grid(group ~ ., scales = "free") +
  labs(x = "Feature",
       y = "Value",
       color = "")

library(reshape2) # for melting

bc_data_hf[, 1] <- h2o.asfactor(bc_data_hf[, 1])

cor <- h2o.cor(bc_data_hf)
rownames(cor) <- colnames(cor)

melt(cor) %>%
  mutate(Var2 = rep(rownames(cor), nrow(cor))) %>%
  mutate(Var2 = factor(Var2, levels = colnames(cor))) %>%
  mutate(variable = factor(variable, levels = colnames(cor))) %>%
  ggplot(aes(x = variable, y = Var2, fill = value)) +
  geom_tile(width = 0.9, height = 0.9) +
  scale_fill_gradient2(low = "white", high = "red", name = "Cor.") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(x = "",
       y = "")

Training, validation and test data
splits <- h2o.splitFrame(bc_data_hf,
                         ratios = c(0.7, 0.15),
                         seed = 1)

train <- splits[[1]]
valid <- splits[[2]]
test <- splits[[3]]

response <- "classes"
features <- setdiff(colnames(train), response)
summary(as.factor(train$classes), exact_quantiles = TRUE)
summary(as.factor(valid$classes), exact_quantiles = TRUE)
summary(as.factor(test$classes), exact_quantiles = TRUE)
pca <- h2o.prcomp(training_frame = train,
                  x = features,
                  validation_frame = valid,
                  transform = "NORMALIZE",
                  impute_missing = TRUE,
                  k = 3,
                  seed = 42)

eigenvec <- as.data.frame(pca@model$eigenvectors)
eigenvec$label <- features

library(ggrepel)
ggplot(eigenvec, aes(x = pc1, y = pc2, label = label)) +
  geom_point(color = "navy", alpha = 0.7) +
  geom_text_repel()

Classification
-
Random forest
hyper_params <- list(
  ntrees = c(25, 50, 75, 100),
  max_depth = c(10, 20, 30),
  min_rows = c(1, 3, 5)
)

search_criteria <- list(
  strategy = "RandomDiscrete",
  max_models = 50,
  max_runtime_secs = 360,
  stopping_rounds = 5,
  stopping_metric = "AUC",
  stopping_tolerance = 0.0005,
  seed = 42
)
rf_grid <- h2o.grid(algorithm = "randomForest", # h2o.randomForest,
                    # alternatively h2o.gbm
                    # for gradient boosting trees
                    x = features,
                    y = response,
                    grid_id = "rf_grid",
                    training_frame = train,
                    validation_frame = valid,
                    nfolds = 25,
                    fold_assignment = "Stratified",
                    hyper_params = hyper_params,
                    search_criteria = search_criteria,
                    seed = 42)
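As the comment in the call above hints, the same grid-search setup also works with H2O's gradient boosting machine; only the algorithm, the grid_id and the algorithm-specific hyperparameters change. A minimal sketch with illustrative (hypothetical) hyperparameter values:
gbm_grid <- h2o.grid(algorithm = "gbm",
                     x = features,
                     y = response,
                     grid_id = "gbm_grid",
                     training_frame = train,
                     validation_frame = valid,
                     nfolds = 25,
                     fold_assignment = "Stratified",
                     hyper_params = list(ntrees = c(25, 50, 75, 100),  # example values
                                         max_depth = c(3, 5, 7),
                                         learn_rate = c(0.01, 0.1)),
                     search_criteria = search_criteria,
                     seed = 42)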
# performance metrics where smaller is better -> order with decreasing = FALSE
sort_options_1 <- c("mean_per_class_error", "mse", "err", "logloss")

for (sort_by_1 in sort_options_1) {

  grid <- h2o.getGrid("rf_grid", sort_by = sort_by_1, decreasing = FALSE)

  model_ids <- grid@model_ids
  best_model <- h2o.getModel(model_ids[[1]])

  h2o.saveModel(best_model, path = "models", force = TRUE)

}


# performance metrics where bigger is better -> order with decreasing = TRUE
sort_options_2 <- c("auc", "precision", "accuracy", "recall", "specificity")

for (sort_by_2 in sort_options_2) {

  grid <- h2o.getGrid("rf_grid", sort_by = sort_by_2, decreasing = TRUE)

  model_ids <- grid@model_ids
  best_model <- h2o.getModel(model_ids[[1]])

  h2o.saveModel(best_model, path = "models", force = TRUE)

}
files <- list.files(path = "models")
rf_models <- files[grep("rf_grid_model", files)]

for (model_id in rf_models) {

  path <- paste0(getwd(), "/models/", model_id)
  best_model <- h2o.loadModel(path)
  mse_auc_test <- data.frame(model_id = model_id,
                             mse = h2o.mse(h2o.performance(best_model, test)),
                             auc = h2o.auc(h2o.performance(best_model, test)))

  if (model_id == rf_models[[1]]) {

    mse_auc_test_comb <- mse_auc_test

  } else {

    mse_auc_test_comb <- rbind(mse_auc_test_comb, mse_auc_test)

  }
}
mse_auc_test_comb %>%
  gather(x, y, mse:auc) %>%
  ggplot(aes(x = model_id, y = y, fill = model_id)) +
  facet_grid(x ~ ., scales = "free") +
  geom_bar(stat = "identity", alpha = 0.8, position = "dodge") +
  scale_fill_brewer(palette = "Set1") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
        plot.margin = unit(c(0.5, 0, 0, 1.5), "cm")) +
  labs(x = "", y = "value", fill = "")

for (model_id in rf_models) {

  best_model <- h2o.getModel(model_id)

  finalRf_predictions <- data.frame(model_id = rep(best_model@model_id,
                                                   nrow(test)),
                                    actual = as.vector(test$classes),
                                    as.data.frame(h2o.predict(object = best_model,
                                                              newdata = test)))

  finalRf_predictions$accurate <- ifelse(finalRf_predictions$actual ==
                                           finalRf_predictions$predict,
                                         "yes", "no")

  finalRf_predictions$predict_stringent <- ifelse(finalRf_predictions$benign > 0.8,
                                                  "benign",
                                                  ifelse(finalRf_predictions$malignant > 0.8,
                                                         "malignant", "uncertain"))

  finalRf_predictions$accurate_stringent <- ifelse(finalRf_predictions$actual ==
                                                     finalRf_predictions$predict_stringent, "yes",
                                                   ifelse(finalRf_predictions$predict_stringent ==
                                                            "uncertain", "na", "no"))

  if (model_id == rf_models[[1]]) {

    finalRf_predictions_comb <- finalRf_predictions

  } else {

    finalRf_predictions_comb <- rbind(finalRf_predictions_comb, finalRf_predictions)

  }
}
finalRf_predictions_comb %>%
  ggplot(aes(x = actual, fill = accurate)) +
  geom_bar(position = "dodge") +
  scale_fill_brewer(palette = "Set1") +
  facet_wrap(~ model_id, ncol = 2) +
  labs(fill = "Were\npredictions\naccurate?",
       title = "Default predictions")

finalRf_predictions_comb %>%
  subset(accurate_stringent != "na") %>%
  ggplot(aes(x = actual, fill = accurate_stringent)) +
  geom_bar(position = "dodge") +
  scale_fill_brewer(palette = "Set1") +
  facet_wrap(~ model_id, ncol = 2) +
  labs(fill = "Were\npredictions\naccurate?",
       title = "Stringent predictions")

rf_model <- h2o.loadModel("models/rf_grid_model_0")
h2o.varimp_plot(rf_model)

h2o.mean_per_class_error(rf_model, train = TRUE, valid = TRUE, xval = TRUE)
h2o.confusionMatrix(rf_model, valid = TRUE)
plot(rf_model,
     timestep = "number_of_trees",
     metric = "classification_error")

plot(rf_model,
     timestep = "number_of_trees",
     metric = "logloss")

plot(rf_model,
     timestep = "number_of_trees",
     metric = "AUC")

plot(rf_model,
     timestep = "number_of_trees",
     metric = "rmse")

h2o.auc(rf_model, train = TRUE)
h2o.auc(rf_model, valid = TRUE)
h2o.auc(rf_model, xval = TRUE)
perf <- h2o.performance(rf_model, test)
perf
plot(perf)

perf@metrics$thresholds_and_metric_scores %>%
  ggplot(aes(x = fpr, y = tpr)) +
  geom_point() +
  geom_line() +
  geom_abline(slope = 1, intercept = 0) +
  labs(x = "False Positive Rate",
       y = "True Positive Rate")

h2o.logloss(perf)
h2o.mse(perf)
h2o.auc(perf)
head(h2o.metric(perf))
finalRf_predictions <- data.frame(actual = as.vector(test$classes),
                                  as.data.frame(h2o.predict(object = rf_model,
                                                            newdata = test)))

finalRf_predictions$accurate <- ifelse(finalRf_predictions$actual ==
                                         finalRf_predictions$predict, "yes", "no")

finalRf_predictions$predict_stringent <- ifelse(finalRf_predictions$benign > 0.8, "benign",
                                                ifelse(finalRf_predictions$malignant > 0.8,
                                                       "malignant", "uncertain"))
finalRf_predictions$accurate_stringent <- ifelse(finalRf_predictions$actual ==
                                                   finalRf_predictions$predict_stringent, "yes",
                                                 ifelse(finalRf_predictions$predict_stringent ==
                                                          "uncertain", "na", "no"))

finalRf_predictions %>%
  group_by(actual, predict) %>%
  dplyr::summarise(n = n())
finalRf_predictions %>%
  ggplot(aes(x = actual, fill = accurate)) +
  geom_bar(position = "dodge") +
  scale_fill_brewer(palette = "Set1") +
  labs(fill = "Were\npredictions\naccurate?",
       title = "Default predictions")

finalRf_predictions %>%
  subset(accurate_stringent != "na") %>%
  ggplot(aes(x = actual, fill = accurate_stringent)) +
  geom_bar(position = "dodge") +
  scale_fill_brewer(palette = "Set1") +
  labs(fill = "Were\npredictions\naccurate?",
       title = "Stringent predictions")

df <- finalRf_predictions[, c(1, 3, 4)]

thresholds <- seq(from = 0, to = 1, by = 0.1)

prop_table <- data.frame(threshold = thresholds, prop_true_b = NA, prop_true_m = NA)

for (threshold in thresholds) {
  pred <- ifelse(df$benign > threshold, "benign", "malignant")
  pred_t <- ifelse(pred == df$actual, TRUE, FALSE)

  group <- data.frame(df, "pred" = pred_t) %>%
    group_by(actual, pred) %>%
    dplyr::summarise(n = n())

  group_b <- filter(group, actual == "benign")

  prop_b <- sum(filter(group_b, pred == TRUE)$n) / sum(group_b$n)
  prop_table[prop_table$threshold == threshold, "prop_true_b"] <- prop_b

  group_m <- filter(group, actual == "malignant")

  prop_m <- sum(filter(group_m, pred == TRUE)$n) / sum(group_m$n)
  prop_table[prop_table$threshold == threshold, "prop_true_m"] <- prop_m
}

prop_table %>%
  gather(x, y, prop_true_b:prop_true_m) %>%
  ggplot(aes(x = threshold, y = y, color = x)) +
  geom_point() +
  geom_line() +
  scale_color_brewer(palette = "Set1") +
  labs(y = "proportion of true predictions",
       color = "b: benign cases\nm: malignant cases")

If you are interested in more machine learning posts, check out the machine_learning category list on my blog:
https://shirinsplayground.netlify.com/categories/#posts-list-machine-learning – https://shiring.github.io/categories.html#machine_learning-ref
stopCluster(cl)  # stop the parallel cluster registered earlier
h2o.shutdown()
sessionInfo()
If you have any thoughts or questions, please leave a comment.