Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit d388d88

Browse files
DOC fix xlabel in Tweedie regression on insurance claims (#30362)
Co-authored-by: Jérémie du Boisberranger <jeremie@probabl.ai>
1 parent 0d9fb78 commit d388d88
Copy full SHA for d388d88

File tree

1 file changed

+21
-18
lines changed
Filter options

1 file changed

+21
-18
lines changed

‎examples/linear_model/plot_tweedie_regression_insurance_claims.py

Copy file name to clipboard
Expand all lines: examples/linear_model/plot_tweedie_regression_insurance_claims.py
+21 -18 — Lines changed: 21 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -613,11 +613,11 @@ def score_estimator(
613613

614614
# %%
615615
#
616-
# Finally, we can compare the two models using a plot of cumulated claims: for
616+
# Finally, we can compare the two models using a plot of cumulative claims: for
617617
# each model, the policyholders are ranked from safest to riskiest based on the
618-
# model predictions and the fraction of observed total cumulated claims is
619-
# plotted on the y axis. This plot is often called the ordered Lorenz curve of
620-
# the model.
618+
# model predictions and the cumulative proportion of claim amounts is plotted
619+
# against the cumulative proportion of exposure. This plot is often called
620+
# the ordered Lorenz curve of the model.
621621
#
622622
# The Gini coefficient (based on the area between the curve and the diagonal)
623623
# can be used as a model selection metric to quantify the ability of the model
@@ -627,7 +627,7 @@ def score_estimator(
627627
# Gini coefficient is upper bounded by 1.0 but even an oracle model that ranks
628628
# the policyholders by the observed claim amounts cannot reach a score of 1.0.
629629
#
630-
# We observe that both models are able to rank policyholders by risky-ness
630+
# We observe that both models are able to rank policyholders by riskiness
631631
# significantly better than chance although they are also both far from the
632632
# oracle model due to the natural difficulty of the prediction problem from a
633633
# few features: most accidents are not predictable and can be caused by
@@ -653,11 +653,11 @@ def lorenz_curve(y_true, y_pred, exposure):
653653
ranking = np.argsort(y_pred)
654654
ranked_exposure = exposure[ranking]
655655
ranked_pure_premium = y_true[ranking]
656-
cumulated_claim_amount = np.cumsum(ranked_pure_premium * ranked_exposure)
657-
cumulated_claim_amount /= cumulated_claim_amount[-1]
658-
cumulated_exposure = np.cumsum(ranked_exposure)
659-
cumulated_exposure /= cumulated_exposure[-1]
660-
return cumulated_exposure, cumulated_claim_amount
656+
cumulative_claim_amount = np.cumsum(ranked_pure_premium * ranked_exposure)
657+
cumulative_claim_amount /= cumulative_claim_amount[-1]
658+
cumulative_exposure = np.cumsum(ranked_exposure)
659+
cumulative_exposure /= cumulative_exposure[-1]
660+
return cumulative_exposure, cumulative_claim_amount
661661

662662

663663
fig, ax = plt.subplots(figsize=(8, 8))
@@ -669,27 +669,30 @@ def lorenz_curve(y_true, y_pred, exposure):
669669
("Frequency * Severity model", y_pred_product),
670670
("Compound Poisson Gamma", y_pred_total),
671671
]:
672-
ordered_samples, cum_claims = lorenz_curve(
672+
cum_exposure, cum_claims = lorenz_curve(
673673
df_test["PurePremium"], y_pred, df_test["Exposure"]
674674
)
675-
gini = 1 - 2 * auc(ordered_samples, cum_claims)
675+
gini = 1 - 2 * auc(cum_exposure, cum_claims)
676676
label += " (Gini index: {:.3f})".format(gini)
677-
ax.plot(ordered_samples, cum_claims, linestyle="-", label=label)
677+
ax.plot(cum_exposure, cum_claims, linestyle="-", label=label)
678678

679679
# Oracle model: y_pred == y_test
680-
ordered_samples, cum_claims = lorenz_curve(
680+
cum_exposure, cum_claims = lorenz_curve(
681681
df_test["PurePremium"], df_test["PurePremium"], df_test["Exposure"]
682682
)
683-
gini = 1 - 2 * auc(ordered_samples, cum_claims)
683+
gini = 1 - 2 * auc(cum_exposure, cum_claims)
684684
label = "Oracle (Gini index: {:.3f})".format(gini)
685-
ax.plot(ordered_samples, cum_claims, linestyle="-.", color="gray", label=label)
685+
ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label)
686686

687687
# Random baseline
688688
ax.plot([0, 1], [0, 1], linestyle="--", color="black", label="Random baseline")
689689
ax.set(
690690
title="Lorenz Curves",
691-
xlabel="Fraction of policyholders\n(ordered by model from safest to riskiest)",
692-
ylabel="Fraction of total claim amount",
691+
xlabel=(
692+
"Cumulative proportion of exposure\n"
693+
"(ordered by model from safest to riskiest)"
694+
),
695+
ylabel="Cumulative proportion of claim amounts",
693696
)
694697
ax.legend(loc="upper left")
695698
plt.plot()

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.