利用小提琴图探索帕尔默企鹅数据
利用小提琴图探索帕尔默企鹅数据
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
from palmerpenguins import load_penguins
数据探索
# 数据展示
# Load the Palmer Penguins dataset.
penguins = load_penguins()
# Preview the leading rows (shown as output when this is the last expression of a notebook cell).
penguins.head()
image-20240129165915062
species:企鹅的种类,包括 Adelie、Chinstrap 和 Gentoo 三种。
island:企鹅所在岛屿的名字,包括 Biscoe、Dream 和 Torgersen 三个岛屿。
bill_length_mm:企鹅的喙长,单位毫米。
bill_depth_mm:企鹅的喙深,单位毫米。
flipper_length_mm:企鹅的鳍长,单位毫米。
body_mass_g:企鹅的体重,单位克。
sex:企鹅的性别,包括 Male 和 Female。
# Data cleaning: drop every row that contains a missing value.
penguins = penguins.dropna()

# Sorted list of the distinct species names; every later step follows this order.
species = sorted(penguins["species"].unique())

# One array of bill lengths (column "bill_length_mm") per species,
# in the same order as `species`.
y_data = [
    penguins.loc[penguins["species"] == name, "bill_length_mm"].values
    for name in species
]
# 构造抖动点:避免数据点重叠
# Scale of the horizontal noise added to each point so that overlapping
# observations stay visible.
jitter = 0.04

# Un-jittered x coordinate of every observation: the index of its group.
x_data = [np.full(len(values), idx) for idx, values in enumerate(y_data)]

# Offset every point by noise drawn from a Student's t distribution (df=6)
# scaled by `jitter`. No random_state is passed, so the exact offsets depend
# on NumPy's global random state and differ between runs.
x_jittered = []
for xs in x_data:
    noise = st.t(df=6, scale=jitter).rvs(len(xs))
    x_jittered.append(xs + noise)
绘制基础小提琴图
# 设置基础信息:包括颜色、位置、水平线
# Background colour shared by the figure and the axes
BG_WHITE = "#fbf9f4"

# Neutral tones: spines, reference lines, box plots and violin outlines
GREY_LIGHT = "#b4aea9"
GREY50 = "#7F7F7F"
GREY_DARK = "#747473"
BLACK = "#282724"

# Accent colours: title, subtitle and mean markers
BLUE = "#2a475e"
BLUE_DARK = "#1B2838"
RED_DARK = "#850e00"

# One colour per species, taken from the Dark2 palette of the RColorBrewer R package
COLOR_SCALE = ["#1B9E77", "#D95F02", "#7570B3"]

# y values of the horizontal reference lines
HLINES = [40, 50, 60]

# x positions of the three species. NOTE: the jittered points are placed at
# the group indices 0, 1, 2 (see `x_data`), so any other value here needs a
# matching change there.
POSITIONS = [0, 1, 2]
# 构造基本布局:对于每个物种,依次添加小提琴图、箱线图、抖动数据点
# Create the canvas.
fig, ax = plt.subplots(figsize=(14, 10))

# Paint the figure and the axes with the same background colour.
for surface in (fig.patch, ax):
    surface.set_facecolor(BG_WHITE)

# Dashed horizontal reference lines at zorder=0, as a guide for reading
# bill_length_mm values.
for level in HLINES:
    ax.axhline(level, color=GREY50, linestyle=(0, (5, 5)), alpha=0.8, zorder=0)
# Violin outlines only: the mean, median and extrema markers are switched off.
violin_options = {
    "positions": POSITIONS,
    "widths": 0.45,
    "bw_method": "silverman",
    "showmeans": False,
    "showmedians": False,
    "showextrema": False,
}
violins = ax.violinplot(y_data, **violin_options)

# Restyle the violin bodies: no fill, fully opaque outline in BLACK.
for body in violins["bodies"]:
    body.set(facecolor="none", edgecolor=BLACK, linewidth=1.4, alpha=1)
# Box plots, drawn at the same positions as the violins.
# Thick median line with "butt" cap style.
medianprops = {"linewidth": 4, "color": GREY_DARK, "solid_capstyle": "butt"}
# Line style shared by the box outline and the whiskers.
boxprops = {"linewidth": 2, "color": GREY_DARK}

ax.boxplot(
    y_data,
    positions=POSITIONS,
    showfliers=False,  # do not show the outliers beyond the caps
    showcaps=False,  # do not show the caps
    medianprops=medianprops,
    whiskerprops=boxprops,
    boxprops=boxprops,
)
# Jittered raw observations: one semi-transparent scatter layer per species,
# each in its own colour from COLOR_SCALE.
for xs, ys, colour in zip(x_jittered, y_data, COLOR_SCALE):
    ax.scatter(xs, ys, s=100, color=colour, alpha=0.4)
添加自定义注释
众所周知,可视化好不好看,全凭注释是否精(花)准(哨)。接下来就是最考验技术的地方了!
# 添加平均值标签
# Mean annotations: mark the mean of every group and label it.
means = [y.mean() for y in y_data]

# Anchor everything at POSITIONS (not at the loop index) so the markers stay
# aligned with the violins and box plots, which are drawn at POSITIONS.
for pos, mean in zip(POSITIONS, means):
    label_x = pos + 0.25

    # Dot marking the mean
    ax.scatter(pos, mean, s=250, color=RED_DARK, zorder=3)

    # Short line connecting the dot to its label
    ax.plot([pos, label_x], [mean, mean], ls="dashdot", color="black", zorder=3)

    # Boxed label; always two decimals so the labels are formatted uniformly
    ax.text(
        label_x,
        mean,
        r"$\hat{\mu}_{\rm{mean}} = $" + f"{mean:.2f}",
        fontsize=13,
        va="center",
        bbox=dict(
            facecolor="white",
            edgecolor="black",
            boxstyle="round",
            pad=0.15,
        ),
        zorder=10,  # keep the label above the other artists
    )
# Pairwise-comparison brackets with their p-value labels.
tick_len = 0.25  # length of the downward tick at each end of a bracket
pad = 0.2  # vertical gap between a bracket and its label

# The p-values are hard-coded here; they are not computed by this script.
label1 = r"$p_{\rm{Holm-corrected}}$ = 8.42e-14"
label2 = r"$p_{\rm{Holm-corrected}}$ = 4.3e-14"
label3 = r"$p_{\rm{Holm-corrected}}$ = 0.031"

# (index of left group, index of right group, bracket height, label);
# the indices refer to entries of POSITIONS.
comparisons = [
    (0, 1, 62.5, label1),
    (0, 2, 65, label2),
    (1, 2, 67.5, label3),
]

for left, right, height, label in comparisons:
    # Take the x coordinates from POSITIONS so the brackets stay attached to
    # the violins and box plots, which are drawn at POSITIONS.
    x_left, x_right = POSITIONS[left], POSITIONS[right]
    ax.plot(
        [x_left, x_left, x_right, x_right],
        [height - tick_len, height, height, height - tick_len],
        c="black",
    )
    # Label centred above the bracket
    ax.text(
        (x_left + x_right) / 2,
        height + pad,
        label,
        fontsize=11,
        va="bottom",
        ha="center",
    )

# Display the figure (notebook cell output).
fig
为图表增加更丰富的信息
# 自定义布局
# Hide the right and top spines.
for side in ("right", "top"):
    ax.spines[side].set_color("none")

# Left and bottom spines: GREY_LIGHT colour, line width 2.
for side in ("left", "bottom"):
    spine = ax.spines[side]
    spine.set_color(GREY_LIGHT)
    spine.set_linewidth(2)

# Zero-length tick marks; y ticks sit on the reference lines.
ax.tick_params(length=0)
ax.set_yticks(HLINES)
ax.set_yticklabels(HLINES, size=15)
ax.set_ylabel("Bill Length", size=18, weight="bold")

# x tick labels: species name with its number of observations underneath.
xlabels = [f"{name}\n(n={values.size})" for name, values in zip(species, y_data)]
ax.set_xticks(POSITIONS)
ax.set_xticklabels(xlabels, size=15, ha="center", ma="center")
ax.set_xlabel("Penguin Species", size=18, weight="bold")
# Title, subtitle and bottom-right caption.

# Main title.
# NOTE(review): requests the "Lobster Two" font; confirm it is installed on
# the machine that renders the figure.
fig.suptitle(
    "Distribution of bill length across penguins species",
    x=0.122,
    y=0.975,
    ha="left",
    fontsize=26,
    fontname="Lobster Two",
    color=BLUE,
    weight="bold",
)

# Subtitle: Welch F-test summary, left-aligned above the axes.
# The values are hard-coded; they are not computed by this script.
stats = [
    r"$F_{\rm{Welch}}$(2, 165.34)=409.93",
    r"p=8.27e-65",
    r"$\widehat{\omega_p^2}$=0.83",
    r"CI$_{95\%}$[0.79, 0.86]",
    r"n$_{\rm{obs}}$=333"
]
ax.set_title(
    ", ".join(stats),
    loc="left",
    ha="left",
    fontsize=20,
    color=BLUE_DARK
)

# Bottom-right caption, first line: Bayesian summary.
# This list has its own name so the subtitle's `stats` list cannot overwrite
# it; previously it was overwritten before use and the caption repeated the
# subtitle text.
bayes_stats = [
    r"$\log_{\rm{e}}(\rm{BF}_{01})=-195.59$",
    r"$\widehat{R^2}_{\rm{Bayesian}}^{\rm{posterior}}=0.70$",
    r"$\rm{CI}_{95\%}^{\rm{HDI}}[0.67, 0.73]$",
    r"$r^{\rm{Cauchy}}_{\rm{JZS}} = 0.71$",
]
fig.text(
    0.55,
    0.03,
    ", ".join(bayes_stats),
    fontsize=10
)

# Bottom-right caption, second line: which pairwise test and comparisons are shown.
fig.text(
    0.55,
    0.005,
    r"Pairwise test: $\bf{Games-Howell}$ $\bf{test}$; Comparisons shown: $\bf{Only}$ $\bf{significant}$",
    fontsize=10
)

# Display the figure (notebook cell output).
fig
参考:Palmer Penguins exploration with violinplots in Matplotlib[1]
共勉~
参考资料
[1]
Palmer Penguins exploration with violinplots in Matplotlib: /
本文参与 腾讯云自媒体同步曝光计划,分享自微信公众号。原始发表:2025-04-22,如有侵权请联系 cloudcommunity@tencent 删除dataset布局基础数据