% Journal article record; the citation key 1028 is kept unchanged.
% The export's nonstandard entry type (revista_internacional) is replaced by
% the standard article type so that journal, volume and pages are handled by
% ordinary bibliography styles. The DOI is stored bare (no resolver prefix);
% "pages" holds the article number given by the export.
@article{1028,
  author   = {Saha, Subrata and Sharma, Prashant and Jain, Atul Kumar and Dutta, Bapi and Martínez, Luis and Saleh, Sarkaft and Dolai, Tuphan Kanti and Kaviraj, Anilava and Sanyal, Tanmay and Nielsen, Izabela and Das, Reena},
  title    = {Detection of β-Thalassemia trait from a heterogeneous population with red cell indices and parameters},
  journal  = {Computers in Biology and Medicine},
  volume   = {192},
  pages    = {110151},
  year     = {2025},
  issn     = {0010-4825},
  doi      = {10.1016/j.compbiomed.2025.110151},
  url      = {https://www.sciencedirect.com/science/article/pii/S0010482525005025},
  keywords = {Thalassemia screening, Supervised machine learning algorithm, Multi-class classification},
  abstract = {Background: India is home to about 42 million people with β-thalassemia trait (βTT) necessitating screening of βTT to stop spread of the disease. Over the years, researchers developed discrimination formulae based on red blood cell (RBC) parameters to screen β-thalassemia trait from iron deficiency anemia (IDA). However, the screening programs often encounter normal subjects (NSs) with other hemoglobinopathy variants. Because the outcome of existing formulas is binary, they often club normal subjects (NS) or variants such as Hemoglobin E (HbE) traits with either βTT or IDA. Therefore, it is necessary to segregate βTT, IDA, HbE, and NS in mixed population data for rational screening. Methods: A test data of 2877 subjects with 1226 NS, 425 HbE, 223 IDA, and 1003 βTT were collected from the Postgraduate Institute of Medical Education and Research (PGIMER), Chandigarh, India and NRS Medical College and Hospital, Kolkata, India. First, we evaluated the performance of 25 discrimination formulae and four machine learning algorithms (MLA), Multi-Layer Perceptron (MLP), Neighborhood Components Analysis (NCA), eXtreme Gradient Boosting Classifier (XGBC), and SKope-Rules (SKR) based on seven performance measures. Based on the performance measures, we selected four discrimination formulae and two MLAs for further evaluation. The SHapley Additive exPlanations (SHAP) model was employed to explore the interpretability of outcomes. We generated four rules using the SKR algorithm to discriminate variants of hemoglobinopathies. Finally, a step-wise implementation scheme for screening is proposed. Results: Results demonstrate that a single formula cannot ensure high performance for all the performance measures. When tested on data set containing βTT and IDA samples, the best-performing formulae appear as SCSβTT in terms of sensitivity (SE) and negative predictive value (NPV); Sirachainan in terms of specificity (SP) and positive predictive value (PPV); CRUISE in terms of Youden index (YI) and RF-4 in terms of Matthews correlation coefficient (MCC) and κ-coefficient, respectively. Among MLAs, the best-performing algorithms are Skope-rule regarding SP, YI, PPV, and XGBC in the rest of the measures. When tested on a heterogeneous data set, MCC and κ-coefficient for these four formulae are decreased, but the performance of the two MLAs remains steady. The proposed scheme demonstrates around 97.33–97.62% accuracy while applied to two validation data sets collected from different sources. Conclusion: The performances of XGBC and SKR algorithms for multi-class classification remain steady while segregating different variants of hemoglobinopathies. The developed rules may be helpful for pre-screening individuals and a possible solution for screening in a mixed population with multiple variants for sustainable, cost-effective, and resource-saving screening.},
}