@@ -613,11 +613,11 @@ def score_estimator(
613
613
614
614
# %%
615
615
#
616
- # Finally, we can compare the two models using a plot of cumulated claims: for
616
+ # Finally, we can compare the two models using a plot of cumulative claims: for
617
617
# each model, the policyholders are ranked from safest to riskiest based on the
618
- # model predictions and the fraction of observed total cumulated claims is
619
- # plotted on the y axis. This plot is often called the ordered Lorenz curve of
620
- # the model.
618
+ # model predictions and the cumulative proportion of claim amounts is plotted
619
+ # against the cumulative proportion of exposure. This plot is often called
620
+ # the ordered Lorenz curve of the model.
621
621
#
622
622
# The Gini coefficient (based on the area between the curve and the diagonal)
623
623
# can be used as a model selection metric to quantify the ability of the model
@@ -627,7 +627,7 @@ def score_estimator(
627
627
# Gini coefficient is upper bounded by 1.0 but even an oracle model that ranks
628
628
# the policyholders by the observed claim amounts cannot reach a score of 1.0.
629
629
#
630
- # We observe that both models are able to rank policyholders by risky-ness
630
+ # We observe that both models are able to rank policyholders by riskiness
631
631
# significantly better than chance although they are also both far from the
632
632
# oracle model due to the natural difficulty of the prediction problem from a
633
633
# few features: most accidents are not predictable and can be caused by
@@ -653,11 +653,11 @@ def lorenz_curve(y_true, y_pred, exposure):
653
653
ranking = np .argsort (y_pred )
654
654
ranked_exposure = exposure [ranking ]
655
655
ranked_pure_premium = y_true [ranking ]
656
- cumulated_claim_amount = np .cumsum (ranked_pure_premium * ranked_exposure )
657
- cumulated_claim_amount /= cumulated_claim_amount [- 1 ]
658
- cumulated_exposure = np .cumsum (ranked_exposure )
659
- cumulated_exposure /= cumulated_exposure [- 1 ]
660
- return cumulated_exposure , cumulated_claim_amount
656
+ cumulative_claim_amount = np .cumsum (ranked_pure_premium * ranked_exposure )
657
+ cumulative_claim_amount /= cumulative_claim_amount [- 1 ]
658
+ cumulative_exposure = np .cumsum (ranked_exposure )
659
+ cumulative_exposure /= cumulative_exposure [- 1 ]
660
+ return cumulative_exposure , cumulative_claim_amount
661
661
662
662
663
663
fig , ax = plt .subplots (figsize = (8 , 8 ))
@@ -669,27 +669,30 @@ def lorenz_curve(y_true, y_pred, exposure):
669
669
("Frequency * Severity model" , y_pred_product ),
670
670
("Compound Poisson Gamma" , y_pred_total ),
671
671
]:
672
- ordered_samples , cum_claims = lorenz_curve (
672
+ cum_exposure , cum_claims = lorenz_curve (
673
673
df_test ["PurePremium" ], y_pred , df_test ["Exposure" ]
674
674
)
675
- gini = 1 - 2 * auc (ordered_samples , cum_claims )
675
+ gini = 1 - 2 * auc (cum_exposure , cum_claims )
676
676
label += " (Gini index: {:.3f})" .format (gini )
677
- ax .plot (ordered_samples , cum_claims , linestyle = "-" , label = label )
677
+ ax .plot (cum_exposure , cum_claims , linestyle = "-" , label = label )
678
678
679
679
# Oracle model: y_pred == y_test
680
- ordered_samples , cum_claims = lorenz_curve (
680
+ cum_exposure , cum_claims = lorenz_curve (
681
681
df_test ["PurePremium" ], df_test ["PurePremium" ], df_test ["Exposure" ]
682
682
)
683
- gini = 1 - 2 * auc (ordered_samples , cum_claims )
683
+ gini = 1 - 2 * auc (cum_exposure , cum_claims )
684
684
label = "Oracle (Gini index: {:.3f})" .format (gini )
685
- ax .plot (ordered_samples , cum_claims , linestyle = "-." , color = "gray" , label = label )
685
+ ax .plot (cum_exposure , cum_claims , linestyle = "-." , color = "gray" , label = label )
686
686
687
687
# Random baseline
688
688
ax .plot ([0 , 1 ], [0 , 1 ], linestyle = "--" , color = "black" , label = "Random baseline" )
689
689
ax .set (
690
690
title = "Lorenz Curves" ,
691
- xlabel = "Fraction of policyholders\n (ordered by model from safest to riskiest)" ,
692
- ylabel = "Fraction of total claim amount" ,
691
+ xlabel = (
692
+ "Cumulative proportion of exposure\n "
693
+ "(ordered by model from safest to riskiest)"
694
+ ),
695
+ ylabel = "Cumulative proportion of claim amounts" ,
693
696
)
694
697
ax .legend (loc = "upper left" )
695
698
plt .plot ()
0 commit comments