【ML】R机器学习介绍第一部分
笔者邀请您,先思考:
1 您使用哪些R包来做机器学习?
2 您使用机器学习解决什么问题?
这是我在德国海德堡大学于2018年6月28日所做的关于R的机器学习介绍的研讨会的幻灯片。整个研讨会的代码可以在视频下面找到。
研讨会介绍了机器学习的基本知识。通过一个示例数据集,我在R中使用caret和h2o包完成了一个标准的机器学习工作流:
-
读取数据
-
探索性数据分析
-
缺失值
-
特征工程
-
训练和测试划分
-
使用随机森林,梯度提升,ANN等训练模型
-
超参数调优
设置
所有的分析都是在R中使用RStudio进行的。有关包括R版本、操作系统和包版本在内的详细会话信息,请参阅本文末尾的sessionInfo()输出。
所有的图片都是ggplot2生成。
-
包
1library(tidyverse)
2library(readr)
3library(mice)
数据准备
这个分析例子我使用的数据集是 Breast Cancer Wisconsin (Diagnostic) Dataset。这个数据集可以在UC Irvine Machine Learning Repository下载。
第一个数据集查看预测类:
-
恶性的
-
良性乳腺质量。
这些特征描述了细胞核的特性,是通过对乳腺肿块细针吸出物(FNA)的图像分析得出的:
-
样本ID(代码号)
-
丛厚度
-
细胞大小均匀性
-
细胞形状均匀性
-
边际附着力
-
单个上皮细胞大小
-
裸核数
-
乏味的染色质
-
正常核数
-
有丝分裂
-
类,即诊断
1bc_data <- read_delim("datasets/breast-cancer-wisconsin.data.txt",
2 delim = ",",
3 col_names = c("sample_code_number",
4 "clump_thickness",
5 "uniformity_of_cell_size",
6 "uniformity_of_cell_shape",
7 "marginal_adhesion",
8 "single_epithelial_cell_size",
9 "bare_nuclei",
10 "bland_chromatin",
11 "normal_nucleoli",
12 "mitosis",
13 "classes")) %>%
14 mutate(bare_nuclei = as.numeric(bare_nuclei),
15 classes = ifelse(classes == "2", "benign",
16 ifelse(classes == "4", "malignant", NA)))
1summary(bc_data)
数据缺失
1md.pattern(bc_data, plot = FALSE)
1bc_data <- bc_data %>%
2 drop_na() %>%
3 select(classes, everything(), -sample_code_number)
4head(bc_data)
缺失值可以用mice package来插补。
更多信息和教程与代码:
https://shirinsplayground.netlify.com/2018/04/flu_prediction/
数据探索
-
分类响应变量
1ggplot(bc_data, aes(x = classes, fill = classes)) +
2 geom_bar()

关于处理不平衡类更多信息:
https://shiring.github.io/machine_learning/2017/04/02/unbalanced
-
回归响应变量
1ggplot(bc_data, aes(x = clump_thickness)) +
2 geom_histogram(bins = 10)

-
特征集
1gather(bc_data, x, y, clump_thickness:mitosis) %>%
2 ggplot(aes(x = y, color = classes, fill = classes)) +
3 geom_density(alpha = 0.3) +
4 facet_wrap( ~ x, scales = "free", ncol = 3)

-
相关图
1co_mat_benign <- filter(bc_data, classes == "benign") %>%
2 select(-1) %>%
3 cor()
4
5co_mat_malignant <- filter(bc_data, classes == "malignant") %>%
6 select(-1) %>%
7 cor()
8
9library(igraph)
10g_benign <- graph.adjacency(co_mat_benign,
11 weighted = TRUE,
12 diag = FALSE,
13 mode = "upper")
14
15g_malignant <- graph.adjacency(co_mat_malignant,
16 weighted = TRUE,
17 diag = FALSE,
18 mode = "upper")
19
20
21# http://kateto.net/networks-r-igraph
22
23cut.off_b <- mean(E(g_benign)$weight)
24cut.off_m <- mean(E(g_malignant)$weight)
25
26g_benign_2 <- delete_edges(g_benign, E(g_benign)[weight < cut.off_b])
27g_malignant_2 <- delete_edges(g_malignant, E(g_malignant)[weight < cut.off_m])
28
29c_g_benign_2 <- cluster_fast_greedy(g_benign_2)
30c_g_malignant_2 <- cluster_fast_greedy(g_malignant_2)
31par(mfrow = c(1,2))
32
33plot(c_g_benign_2, g_benign_2,
34 vertex.size = colSums(co_mat_benign) * 10,
35 vertex.frame.color = NA,
36 vertex.label.color = "black",
37 vertex.label.cex = 0.8,
38 edge.width = E(g_benign_2)$weight * 15,
39 layout = layout_with_fr(g_benign_2),
40 main = "Benign tumors")
41
42plot(c_g_malignant_2, g_malignant_2,
43 vertex.size = colSums(co_mat_malignant) * 10,
44 vertex.frame.color = NA,
45 vertex.label.color = "black",
46 vertex.label.cex = 0.8,
47 edge.width = E(g_malignant_2)$weight * 15,
48 layout = layout_with_fr(g_malignant_2),
49 main = "Malignant tumors")
50

主成分分析
1library(ellipse)
2
3# perform pca and extract scores
4pcaOutput <- prcomp(as.matrix(bc_data[, -1]), scale = TRUE, center = TRUE)
5pcaOutput2 <- as.data.frame(pcaOutput$x)
6
7# define groups for plotting
8pcaOutput2$groups <- bc_data$classes
9
10centroids <- aggregate(cbind(PC1, PC2) ~ groups, pcaOutput2, mean)
11
12conf.rgn <- do.call(rbind, lapply(unique(pcaOutput2$groups), function(t)
13 data.frame(groups = as.character(t),
14 ellipse(cov(pcaOutput2[pcaOutput2$groups == t, 1:2]),
15 centre = as.matrix(centroids[centroids$groups == t, 2:3]),
16 level = 0.95),
17 stringsAsFactors = FALSE)))
18
19ggplot(data = pcaOutput2, aes(x = PC1, y = PC2, group = groups, color = groups)) +
20 geom_polygon(data = conf.rgn, aes(fill = groups), alpha = 0.2) +
21 geom_point(size = 2, alpha = 0.6) +
22 labs(color = "",
23 fill = "")
24

多维缩放
1select(bc_data, -1) %>%
2 dist() %>%
3 cmdscale %>%
4 as.data.frame() %>%
5 mutate(group = bc_data$classes) %>%
6 ggplot(aes(x = V1, y = V2, color = group)) +
7 geom_point()

t-SNE降维
1library(tsne)
2
3select(bc_data, -1) %>%
4 dist() %>%
5 tsne() %>%
6 as.data.frame() %>%
7 mutate(group = bc_data$classes) %>%
8 ggplot(aes(x = V1, y = V2, color = group)) +
9 geom_point()

R的机器学习包
caret
1# configure multicore
2library(doParallel)
3cl <- makeCluster(detectCores())
4registerDoParallel(cl)
5
6library(caret)
训练,验证和测试数据集
1set.seed(42)
2index <- createDataPartition(bc_data$classes, p = 0.7, list = FALSE)
3train_data <- bc_data[index, ]
4test_data <- bc_data[-index, ]
5
6bind_rows(data.frame(group = "train", train_data),
7 data.frame(group = "test", test_data)) %>%
8 gather(x, y, clump_thickness:mitosis) %>%
9 ggplot(aes(x = y, color = group, fill = group)) +
10 geom_density(alpha = 0.3) +
11 facet_wrap( ~ x, scales = "free", ncol = 3)

回归
1set.seed(42)
2model_glm <- caret::train(clump_thickness ~ .,
3 data = train_data,
4 method = "glm",
5 preProcess = c("scale", "center"),
6 trControl = trainControl(method = "repeatedcv",
7 number = 10,
8 repeats = 10,
9 savePredictions = TRUE,
10 verboseIter = FALSE))
11
1model_glm
1predictions <- predict(model_glm, test_data)
1# model_glm$finalModel$linear.predictors == model_glm$finalModel$fitted.values
2data.frame(residuals = resid(model_glm),
3 predictors = model_glm$finalModel$linear.predictors) %>%
4 ggplot(aes(x = predictors, y = residuals)) +
5 geom_jitter() +
6 geom_smooth(method = "lm")

1# y == train_data$clump_thickness
2data.frame(residuals = resid(model_glm),
3 y = model_glm$finalModel$y) %>%
4 ggplot(aes(x = y, y = residuals)) +
5 geom_jitter() +
6 geom_smooth(method = "lm")

1data.frame(actual = test_data$clump_thickness,
2 predicted = predictions) %>%
3 ggplot(aes(x = actual, y = predicted)) +
4 geom_jitter() +
5 geom_smooth(method = "lm")

作者:Dr. Shirin Glander
原文链接:
https://shirinsplayground.netlify.com/2018/06/intro_to_ml_workshop_heidelberg/
您有什么想法,请留言。
文章推荐:
版权声明:作者保留权利。文章为作者独立观点,不代表数据人网立场。严禁修改,转载请注明原文链接:http://shujuren.org/article/764.html
数据人网:数据人学习,交流和分享的平台,诚邀您创造和分享数据知识,共建和共享数据智库。
请关注“恒诺新知”微信公众号,感谢“R语言“,”数据那些事儿“,”老俊俊的生信笔记“,”冷🈚️思“,“珞珈R”,“生信星球”的支持!