R语言学习:如何引用R和R包,关系型数据处理,双变量相关性检验和可视化,fastshap包
2021年第46周。
这一周R语言学习,记录如下。
01
tidyverse学习代码片段
每天编写R脚本,我都会用到tidyverse包。
tidyverse包,让你高效完成数据科学任务。我会通过各种方式来学习和增进tidyverse的知识和技能。
tidyverse学习代码片段,你可以亲自实践。
library(tidyverse)
library(nycflights13)
library(lubridate)
# 数据集检视
glimpse(flights)
flights %>%
slice_head(n = 100) %>%
View
# 1 mutate函数
flights %>%
mutate(long_flight = (air_time >= 6 * 60)) %>%
slice_head(n = 100) %>%
View()
flights %>%
mutate(long_flight = (air_time >= 6 * 60)) %>%
count(long_flight)
flights %>%
count(long_flight = air_time >= 6 * 60)
flights %>%
count(flight_path = str_c(origin, " -> ", dest), sort = TRUE)
# 2 group_by和summarise函数
flights %>%
group_by(date = make_date(year, month, day)) %>%
summarise(flights_n = n(), air_time_mean = mean(air_time, na.rm = TRUE)) %>%
ungroup()
# 3 随机抽样数据集
flights %>%
slice_sample(n = 15)
flights %>%
slice_sample(prop = 0.15)
# 4 生成日期列
flights %>%
select(year, month, day) %>%
mutate(date = make_date(year, month, day)) %>%
slice_head(n = 100) %>%
View
# 5 数字解析
numbers_1 <- tibble(number = c("#1", "Number8", "How are you 3"))
numbers_1 %>% mutate(number = parse_number(number))
# 6 选择列starts_with函数|ends_with函数|contains函数
flights %>%
select(starts_with("dep_"))
flights %>%
select(ends_with("hour"))
flights %>%
select(contains("hour"))
# 7 case_when使用
flights %>%
mutate(origin = case_when(
(origin == "EWR") & dep_delay > 20 ~ "Newark International Airport - DELAYED",
(origin == "EWR") & dep_delay <= 20 ~ "Newark International Airport - ON TIME DEPARTURE",
TRUE ~ "Other"
)) %>%
count(origin)
# 8 str_replace_all函数使用
flights %>%
mutate(origin = str_replace_all(origin, c(
"^EWR$" = "Newark International", "^JFK$" = "John F. Kennedy International"
))) %>%
count(origin)
# 9 filter函数和group_by函数结合
flights_top_carriers <- flights %>%
group_by(carrier) %>%
filter(n() >= 10000) %>%
ungroup()
# 10 其它操作
beginning_with_am <- airlines %>%
filter(name %>% str_detect("^Am"))
flights %>%
anti_join(beginning_with_am, by = "carrier")
airline_names <- flights %>%
left_join(airlines, by = "carrier")
airline_names %>%
count(name) %>%
ggplot(aes(name, n)) +
geom_col()
airline_names %>%
count(name) %>%
mutate(name = fct_reorder(name, n)) %>%
ggplot(aes(name, n)) +
geom_col()
airline_names %>%
count(name) %>%
mutate(name = fct_reorder(name, n)) %>%
ggplot(aes(name, n)) +
geom_col() +
coord_flip()
crossing(
customer_channel = c("Bus", "Car"),
customer_status = c("New", "Repeat"),
spend_range = c("$0-$10", "$10-$20", "$20-$50", "$50+"))
summary <- function(data, col_names, na.rm = TRUE) {
data %>%
summarise(across({{ col_names }},
list(
min = min,
max = max,
median = median,
mean = mean
),
na.rm = na.rm,
.names = "{col}_{fn}"
))
}
airline_names %>%
summary(c(air_time, arr_delay))
airline_names %>%
group_by(carrier) %>%
summary(c(air_time, arr_delay))
02
如何引用R和R包?
我利用R语言做科研工作。
参考文献如何引用R和R包?可以使用citation函数。
# R 引用
citation()
# R包 引用
citation(package = "scorecard")
print(citation(package = "scorecard"), style = "text")
library(purrr)
c("dplyr", "scorecard") %>%
map(citation) %>%
print(style = "text")
运行结果
03
关系型数据处理
R4DS书籍第10章学习。
关系型数据介绍和处理。
数据科学任务常用技能之一,我们所要处理的业务问题,通常需要来自多张表的数据,这就需要数据的关联和集成。
#############
#关系型数据
#dplyr包
#学习资料:R4DS第10章
############
# 准备工作
library(tidyverse)
library(nycflights13)
data(package="nycflights13")
glimpse(airlines)
glimpse(airports)
glimpse(planes)
glimpse(weather)
glimpse(flights)
# 主键判断
# 用于唯一标识表的样本
planes %>%
count(tailnum) %>%
filter(n > 1)
weather %>%
count(year, month, day, hour, origin) %>%
filter(n > 1)
flights %>%
count(year, month, day, flight) %>%
filter(n > 1)
flights %>%
count(year, month, day, tailnum) %>%
filter(n > 1)
# 表格没有主键的解决方案
# 使用mutate()或者row_number()增加一个标记
# Mutating Joins
flights2 <- flights %>%
select(year:day, hour, origin, dest, tailnum, carrier)
flights2 %>%
slice_head(n = 100) %>%
View
# 想知道full airline name的全名
flights2 %>%
select(-origin, -dest) %>%
left_join(airlines, by = "carrier") %>%
slice_head(n = 100) %>%
View
# 等价于
flights2 %>%
select(-origin, -dest) %>%
mutate(
name = airlines$name[match(carrier, airlines$carrier)]
) %>%
slice_head(n = 100) %>%
View
# 理解Joins
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
3, "x3"
)
y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2",
4, "y4"
)
# inner_join
x %>%
inner_join(y, by = "key") %>%
View
# outer joins
# 1) left join
# 2) right join
# 3) full join
# Filter Joins
# 1) semi_join
# 2) anti_join
top_dest <- flights %>%
count(dest, sort = TRUE) %>%
head(10)
top_dest %>%
View
flights %>%
filter(dest %in% top_dest$dest) %>%
slice_head(n = 100) %>%
View
flights %>%
semi_join(top_dest) %>%
slice_head(n = 100) %>%
View
各种join的理解,可以进一步阅读这篇文章。
04
双变量相关性检验和可视化
ggpubr包可以便捷实现双变量相关性检验和可视化。
library(tidyverse)
library(ggpubr)
# 导入数据
my_data <- mtcars
# 数据检视
head(my_data)
# 元数据
glimpse(my_data)
str(my_data)
# 探索mpg与wt的相关性
ggscatter(
my_data,
x = "mpg",
y = "wt",
add = "reg.line",
conf.int = TRUE,
cor.coef = TRUE,
cor.method = "pearson",
xlab = "Miles/(US) gallon",
ylab = "Weight (1000 lbs)"
)
结果图
05
ROC曲线
ROC曲线利用可视化技术表示二分类的性能。
ROC曲线x轴是FPR,y轴是TPR。
FPR和TPR的计算公式
FPR = FP / (FP + TN)
TPR = TP / (TP + FN)
ROC 曲线是针对所有可能的阈值绘制的图形,y 轴上为 TPR,x 轴上为 FPR。TPR 和 FPR 都在 0 到 1 之间变化。
# 导入数据集
library(tidyverse)
library(DataExplorer)
library(e1071)
library(pROC)
raw_data <- read_csv("./data/binary.csv")
# 数据探索性分析
# 变量缺失率可视化
plot_missing(raw_data)
# 数据集划分
set.seed(123)
partition <- sample(2, nrow(raw_data), replace = TRUE, prob = c(0.7, 0.3))
tdata <- raw_data[partition == 1,]
vdata <- raw_data[partition == 2,]
dim(tdata)
dim(vdata)
vdata_X <- vdata %>% select(-admit)
vdata_Y <- vdata$admit
# 模型建构
# 1 LR模型
LR_fit <- glm(admit ~ ., data = tdata, family = binomial())
# 2 SVM模型
svm_fit <- svm(
admit ~ .,
data = tdata,
kernel = "linear",
cost = 1,
scale = FALSE
)
# 模型评价
LR_predict <- predict(LR_fit, newdata = vdata_X, type = "response")
svm_predict <- predict(svm_fit, newdata = vdata_X, type = "response")
# 使用ROC曲线
par(pty = "s")
lrROC <- roc(
vdata_Y ~ LR_predict,
plot = TRUE,
print.auc = TRUE,
col = "green",
lwd = 4,
legacy.axes = TRUE,
main = "ROC Curves"
)
svmROC <- roc(
vdata_Y ~ svm_predict,
plot = TRUE,
print.auc = TRUE,
col = "blue",
lwd = 4,
print.auc.y = 0.4,
legacy.axes = TRUE,
add = TRUE
)
legend(
"bottomright",
legend = c("Logistic Regression", "SVM"),
col = c("green", "blue"),
lwd = 4
)
ROC曲线结果图
结论:
ROC曲线对比分析,SVM算法比LR算法有微小提升。
学习资料:
https://medium.com/swlh/roc-curve-and-auc-detailed-understanding-and-r-proc-package-86d1430a3191
06
fastshap包
使用shap值对模型做解释性分析。
# 使用shape值对模型做解释性分析
library(fastshap)
library(ranger)
library(tidyverse)
# 生成模拟数据集
trn <- gen_friedman(3000, seed = 101)
X <- subset(trn, select = -y)
set.seed(102)
rfo <- ranger(y ~ ., data = trn)
# Prediction wrapper
pfun <- function(object, newdata) {
predict(object, data = newdata)$predictions
}
# Compute fast (approximate) Shapley values using 10 Monte Carlo repetitions
system.time({ # estimate run time
set.seed(5038)
shap <- explain(rfo, X = X, pred_wrapper = pfun, nsim = 10)
})
shap
shap_imp <- data.frame(
Variable = names(shap),
Importance = apply(shap, MARGIN = 2, FUN = function(x) sum(abs(x)))
)
# 基于Shapley value的特征重要性分析
plot1 <- ggplot(shap_imp, aes(reorder(Variable, Importance), Importance)) +
geom_col(color = "#6723ef", fill = "orange") +
coord_flip() +
xlab("") +
ylab("mean(|Shapley value|)") +
theme_classic()
plot1
# 单样本特征贡献分析
expl <- explain(rfo, X = X,pred_wrapper = pfun, nsim = 10, newdata = X[1L, ])
plot2 <- autoplot(expl, type = "contribution") +
theme_classic()
gridExtra::grid.arrange(plot1, plot2, nrow = 1)
结果图
我创建了R语言群,添加我的微信,备注:姓名-入群,我邀请你进群,一起学习R语言。
如果你想了解数据科学与人工智能,请关注下方公众号~
如果你想找数据工作,请关注下方公众号~
R语言学习专辑:
觉得本文不错,就顺手帮我转发到朋友圈和微信群哦,谢谢。
请关注“恒诺新知”微信公众号,感谢“R语言“,”数据那些事儿“,”老俊俊的生信笔记“,”冷🈚️思“,“珞珈R”,“生信星球”的支持!