使用spotifyr聚类Springsteen专辑
笔者邀请您,先思考:
1 如何对数据集做相关性分析和聚类分析?
spotifyr包很棒,它让我们可以探索音乐的各个方面,如速度(tempo)、可舞性(danceability)和情绪效价(valence,即音乐所传达情绪的积极程度)。在这篇文章中,我们将从相同点和不同点来探讨布鲁斯·斯普林斯汀(Bruce Springsteen)的专辑。
1# devtools::install_github('charlie86/spotifyr')
2
3library(spotifyr)
4library(tidyverse)
5library(magrittr)
6library(ggridges)
7library(ggcorrplot)
8library(viridisLite)
9library(factoextra)
10library(ggiraphExtra)
使用get_artist_audio_features()函数很容易获得数据。在这里,我们将从csv文件中加载它并查看。
# The same data can be fetched live with spotifyr instead:
# df <- get_artist_audio_features(artist = "bruce springsteen")

# Load the data from the hosted CSV file.
df <- read_csv(
  file = "https://raw.github.com/peerchristensen/Springsteen_album_clusters/master/springsteen_albums.csv"
)

# Print a compact overview of the columns and their types.
df %>% glimpse()
1## Observations: 537
2## Variables: 31
3## $ artist_name <chr> "Bruce Springsteen", "Bruce Springsteen...
4## $ artist_uri <chr> "3eqjTLE0HfPfh78zjh6TqT", "3eqjTLE0HfPf...
5## $ album_uri <chr> "0PMasrHdpaoIRuHuhHp72O", "0PMasrHdpaoI...
6## $ album_name <chr> "Born In The U.S.A.", "Born In The U.S....
7## $ album_img <chr> "https://i.scdn.co/image/d002b63ceb5658...
8## $ album_type <chr> "album", "album", "album", "album", "al...
9## $ is_collaboration <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALS...
10## $ album_release_date <chr> "1984-06-04", "1984-06-04", "1984-06-04...
11## $ album_release_year <date> 1984-06-04, 1984-06-04, 1984-06-04, 19...
12## $ album_popularity <dbl> 76, 76, 76, 76, 76, 76, 76, 76, 76, 76,...
13## $ track_name <chr> "Born in the U.S.A.", "Cover Me", "Darl...
14## $ track_uri <chr> "0dOg1ySSI7NkpAe89Zo0b9", "4U7NhC2rQTAh...
15## $ track_number <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ...
16## $ disc_number <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
17## $ danceability <dbl> 0.398, 0.535, 0.536, 0.429, 0.544, 0.62...
18## $ energy <dbl> 0.952, 0.884, 0.982, 0.949, 0.762, 0.44...
19## $ key <chr> "E", "A", "G", "C", "A#", "C#", "F", "A...
20## $ loudness <dbl> -6.042, -5.499, -4.674, -5.295, -7.289,...
21## $ mode <chr> "major", "minor", "major", "major", "ma...
22## $ speechiness <dbl> 0.0610, 0.0407, 0.0389, 0.0458, 0.0382,...
23## $ acousticness <dbl> 0.000373, 0.001880, 0.014100, 0.084200,...
24## $ instrumentalness <dbl> 7.75e-05, 1.26e-03, 3.67e-05, 0.00e+00,...
25## $ liveness <dbl> 0.1000, 0.1400, 0.2740, 0.1540, 0.0740,...
26## $ valence <dbl> 0.584, 0.796, 0.963, 0.967, 0.473, 0.86...
27## $ tempo <dbl> 122.093, 120.555, 119.201, 184.286, 120...
28## $ duration_ms <dbl> 278680, 205987, 288027, 192267, 215427,...
29## $ time_signature <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...
30## $ key_mode <chr> "E major", "A minor", "G major", "C maj...
31## $ track_popularity <dbl> 72, 51, 45, 47, 49, 71, 50, 47, 53, 62,...
32## $ track_preview_url <chr> "https://p.scdn.co/mp3-preview/3b6a5b91...
33## $ track_open_spotify_url <chr> "https://open.spotify.com/track/0dOg1yS...
我们只需要做一点清洗和删除一些非录音室的专辑。
# Albums to exclude: some only have one song, some are alternate versions.
remove_albums <- c(
  "Greatest Hits",
  "Hammersmith Odeon, London 75",
  "The Essential Bruce Springsteen (Bonus Disc)",
  "The Ties That Bind: The River Collection",
  "Chapter and Verse",
  "The Promise",
  "Tracks"
)

# Drop the excluded albums and every album whose name contains "live"/"Live",
# then tidy the remaining names: title case, cut everything from the first
# colon onwards, and shorten three long titles.
df <- df %>%
  filter(!(album_name %in% remove_albums | grepl("live|Live", album_name))) %>%
  mutate(
    album_name = gsub(":.*", "", str_to_title(album_name)),
    album_name = replace(album_name, grepl("Innocent", album_name), "The Wild, The Innocent.."),
    album_name = replace(album_name, grepl("Greetings", album_name), "Greetings"),
    album_name = replace(album_name, grepl("Darkness", album_name), "Darkness")
  )
让我们先来看看Springsteen歌曲中最常用的五个调(key)。
# Count tracks per key/mode combination and plot the most frequent ones as a
# horizontal bar chart.
df %>%
  # count(sort = TRUE) tallies each key_mode and orders by descending n.
  count(key_mode, sort = TRUE) %>%
  # Name the weight column explicitly instead of relying on top_n() picking
  # the last column; note that top_n() keeps ties, so more than five rows
  # can come back.
  top_n(5, wt = n) %>%
  # Remember the rank so the bars can be ordered by frequency after flipping.
  mutate(ordered = row_number()) %>%
  ggplot(aes(x = reorder(key_mode, desc(ordered)), y = n, fill = n)) +
  geom_col() +
  coord_flip() +
  ggtitle("Five most common keys") +
  # guide = "none" replaces the deprecated logical form (guide = F).
  scale_fill_viridis_c(option = "B", direction = -1, guide = "none") +
  theme_minimal() +
  labs(y = "n", x = "key")

正如我们所看到的,spotifyr从Spotify API获取了许多有趣的数据。让我们先来看看每张专辑的可舞性(danceability)。《Born to Run》的可舞性最低,而《Tunnel of Love》的可舞性最高。
# Ridgeline plot of track danceability with one ridge per album. Albums are
# ordered (and coloured) by album_release_year.
# The group_by() that used to precede ggplot() was dropped: the ridges are
# defined by the y aesthetic, not by dplyr groups.
df %>%
  ggplot(aes(
    x = danceability,
    y = reorder(album_name, desc(album_release_year)),
    fill = reorder(album_name, desc(album_release_year))
  )) +
  geom_density_ridges(colour = "snow") +
  # guide = "none" replaces the deprecated logical form (guide = F).
  scale_fill_viridis_d(option = "B", begin = .05, direction = -1, guide = "none") +
  theme_minimal() +
  ggtitle("Danceability") +
  labs(y = "album")

让我们把所有的特征放在同一个图中。
# Ridgeline plots of six audio features, one facet per feature and one ridge
# per album.
df %>%
  # Reshape to long format: one row per track and feature.
  gather(key = feature, value = measure,
         danceability, energy, loudness, valence, tempo, acousticness) %>%
  ggplot(aes(x = measure,
             y = reorder(album_name, desc(album_release_year)),
             fill = album_release_date)) +
  # `legend` is not a layer parameter and was ignored with a warning;
  # show.legend = FALSE is the argument that hides the layer's legend.
  geom_density_ridges(rel_min_height = 0.005, show.legend = FALSE,
                      alpha = .9, size = .2, colour = "snow") +
  # Each feature has its own value range, hence free scales.
  facet_wrap(~feature, scales = "free", ncol = 2) +
  scale_fill_viridis_d(option = "B", begin = .05) +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 7)) +
  labs(y = "album name") +
  ggtitle("Springsteen albums in six features",
          subtitle = "Acousticness, danceability, energy, loudness, tempo and valence") +
  # "none" replaces the deprecated guides(fill = FALSE).
  guides(fill = "none")

将各个特征之间的相关性可视化也会很有趣。energy和loudness是正相关的,而acousticness和loudness是负相关的,这不足为奇。
# p-values for the pairwise correlations between the six audio features.
sign_test <- cor_pmat(
  select(df, acousticness, danceability, energy, loudness, tempo, valence)
)

# Lower-triangle correlation plot of the same features; p.mat is passed so
# that non-significant correlations are crossed out (see subtitle).
ggcorrplot(
  cor(select(df, acousticness, danceability, energy, loudness, tempo, valence)),
  type = "lower",
  p.mat = sign_test,
  colors = c(inferno(5)[2], "snow", inferno(5)[4])
) +
  ggtitle(
    "Correlations between features",
    subtitle = "Non-significant correlations marked with X"
  )

基于这些特征,我们还可以探索专辑在距离矩阵中的相似性。在这幅图中,橙色表示专辑之间的高度差异或很大的“距离”。
# Build the album-level feature table used by all distance and clustering
# steps: one row per album (album name as row name), one column per feature.
dfScale <- df %>%
  select(album_name, acousticness, danceability, energy, loudness, tempo, valence) %>%
  # Standardise every feature across ALL tracks before grouping. Calling
  # scale() inside the grouped summarise() standardised within each album,
  # which makes every album mean ~0 and erases the differences between albums.
  mutate(across(-album_name, ~ as.numeric(scale(.x)))) %>%
  group_by(album_name) %>%
  # Album profile = mean standardised value of each feature.
  summarise(across(everything(), mean)) %>%
  data.frame()

# Album names become row names so they are used as labels downstream.
row.names(dfScale) <- dfScale$album_name

# Keep only the numeric feature columns.
dfScale <- dfScale %>%
  select(-album_name) %>%
  data.frame()
17
# Pairwise distances between albums; stand = TRUE asks get_dist() to
# standardise the columns before the distances are computed.
df_dist <- get_dist(dfScale, stand = TRUE)

# Heat map of the distance matrix.
fviz_dist(
  df_dist,
  gradient = list(low = inferno(5)[2], mid = "white", high = inferno(5)[4])
) +
  theme_minimal() +
  ggtitle(
    "Distance matrix",
    subtitle = "Similarity between albums based on all features"
  ) +
  theme(
    axis.title = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

为了获得更清晰的图像,我们可以使用ggiraphExtra包中的雷达图来探索专辑和特征之间的模式。
# Radar chart of the feature profile of every album, one facet per album.
dfScale %>%
  # ggRadar needs the album as a regular column to group and facet by.
  mutate(albums = row.names(dfScale)) %>%
  ggRadar(aes(group = albums),
          rescale = FALSE, legend.position = "none",
          size = 1, interactive = FALSE, use.label = TRUE) +
  facet_wrap(~albums) +
  scale_y_discrete(breaks = NULL) +
  # theme_minimal() is a complete theme, so it must come BEFORE the theme()
  # tweaks; added afterwards it discarded the axis text size set earlier.
  theme_minimal() +
  theme(axis.text.x = element_text(size = 10),
        legend.position = "none") +
  scale_fill_viridis_d(option = "B") +
  scale_colour_viridis_d(option = "B")

最后一步,我们将了解如何使用层次聚类和k-means聚类根据各种特征对专辑进行分组。我们首先使用factoextra包中的fviz_nbclust()函数来计算聚类的最优数量。注意,该函数支持用不同的方法来确定聚类的数量,默认情况下使用“silhouette”(轮廓系数)方法。
# Plot the suggested number of clusters for hierarchical clustering (hcut),
# using fviz_nbclust()'s default method.
fviz_nbclust(x = dfScale, FUNcluster = hcut) +
  labs(title = "Optimal Number of Clusters: H-Clustering")

# Hierarchical clustering of the albums on the standardised feature profiles.
df.hc <- hclust(dist(scale(dfScale)))

# Dendrogram cut into k clusters, with branches and labels coloured by cluster.
fviz_dend(df.hc, k = 3,
          cex = .9,
          # One colour per cluster: k = 3 needs three colours. The previous
          # two-colour vector did not match k, so clusters could not each get
          # a distinct colour.
          k_colors = inferno(10)[c(4, 6, 8)],
          color_labels_by_k = TRUE,
          rect = TRUE) +
  # Title typo fixed ("Hierachical" -> "Hierarchical").
  ggtitle("Hierarchical Clustering")

# Plot the suggested number of clusters for k-means, using fviz_nbclust()'s
# default method.
fviz_nbclust(x = dfScale, FUNcluster = kmeans) +
  labs(title = "Optimal Number of Clusters: K-means Clustering")

# k-means starts from random centres; fix the seed so the result is
# reproducible.
set.seed(324789)

# Two clusters, keeping the best of 25 random starts.
km.res <- kmeans(dfScale, centers = 2, nstart = 25)

# Scatter plot of the albums coloured by k-means cluster, with a convex hull
# drawn around each cluster.
fviz_cluster(km.res, data = dfScale,
             ellipse.type = "convex",
             # TRUE instead of T: T is an ordinary variable that can be
             # reassigned.
             repel = TRUE,
             palette = inferno(10)[c(4, 6, 8)],
             ggtheme = theme_minimal(),
             main = "K-means Clustering")

作者:Peer Christensen
原文链接:
https://peerchristensen.netlify.com/post/clustering-springsteen-albums-with-spotifyr/
内容推荐
数据人网:数据人学习,交流和分享的平台,诚邀您创造和分享数据知识,共建和共享数据智库。
请关注“恒诺新知”微信公众号,感谢“R语言”、“数据那些事儿”、“老俊俊的生信笔记”、“冷🈚️思”、“珞珈R”、“生信星球”的支持!