{
  "id": "2411.12377",
  "version": "v1",
  "published": "2024-11-19T09:53:28.000Z",
  "updated": "2024-11-19T09:53:28.000Z",
  "title": "Non-IID data in Federated Learning: A Systematic Review with Taxonomy, Metrics, Methods, Frameworks and Future Directions",
  "authors": [
    "Daniel M. Jimenez G.",
    "David Solans",
    "Mikko Heikkila",
    "Andrea Vitaletti",
    "Nicolas Kourtellis",
    "Aris Anagnostopoulos",
    "Ioannis Chatzigiannakis"
  ],
  "categories": [
    "cs.LG"
  ],
  "abstract": "Recent advances in machine learning have highlighted Federated Learning (FL) as a promising approach that enables multiple distributed users (so-called clients) to collectively train ML models without sharing their private data. While this privacy-preserving method shows potential, it struggles when data across clients is not independent and identically distributed (non-IID) data. The latter remains an unsolved challenge that can result in poorer model performance and slower training times. Despite the significance of non-IID data in FL, there is a lack of consensus among researchers about its classification and quantification. This systematic review aims to fill that gap by providing a detailed taxonomy for non-IID data, partition protocols, and metrics to quantify data heterogeneity. Additionally, we describe popular solutions to address non-IID data and standardized frameworks employed in FL with heterogeneous data. Based on our state-of-the-art review, we present key lessons learned and suggest promising future research directions.",
  "revisions": [
    {
      "version": "v1",
      "updated": "2024-11-19T09:53:28.000Z"
    }
  ],
  "analyses": {
    "keywords": [
      "federated learning",
      "frameworks",
      "directions",
      "enables multiple distributed users",
      "systematic review aims"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 0,
      "language": "en",
      "license": "arXiv",
      "status": "editable"
    }
  }
}