Enforce pre-defined validation constraints

In a previous guide, you defined validation constraints ad-hoc when initializing Curator objects.

Often, you want to enforce a pre-defined set of validation constraints, as the CELLxGENE curator does (see Curate AnnData based on the CELLxGENE schema).

This guide shows how to subclass Curator to enforce pre-defined constraints.

Define a custom curator

Consider the example of electronic health records (EHR). We want to ensure that

  1. every record has the fields disease, phenotype, developmental_stage, and age

  2. values for the categorical fields (disease, phenotype, developmental_stage) map against specific versions of pre-defined ontologies

The following implementation achieves the goal by subclassing DataFrameCurator.

EHR Curator
import bionty as bt
import pandas as pd
from lamindb.core import DataFrameCurator, logger
from lamindb.core.types import UPathStr

__version__ = "0.1.0"


class EHRCurator(DataFrameCurator):
    """Custom curation flow for electronic health record data.

    The curator enforces a fixed standard:

    * the columns ``disease``, ``phenotype``, ``developmental_stage`` and
      ``age`` must be present,
    * the three categorical columns are validated by name against pinned
      versions of the corresponding ontologies.
    """

    def __init__(self, data: pd.DataFrame | UPathStr):
        """Fill in defaults on ``data`` and register the fixed constraints.

        Args:
            data: The records to curate. Missing categorical columns and
                missing values are filled **in place** with defaults, so the
                caller's DataFrame is modified and stays shared with the
                curator (later edits to it are seen by ``validate``).
        """
        # NOTE(review): the annotation also admits a path-like value, but the
        # default-filling below accesses ``data.columns`` and therefore needs a
        # DataFrame — confirm whether path input is meant to be supported.

        # Curate these columns against the specified fields
        DEFAULT_CATEGORICALS = {
            "disease": bt.Disease.name,
            "phenotype": bt.Phenotype.name,
            "developmental_stage": bt.DevelopmentalStage.name,
        }

        # If columns or values are missing, we substitute with these defaults.
        # Keys must match the column names in DEFAULT_CATEGORICALS exactly.
        DEFAULT_VALUES = {
            "disease": "normal",
            "developmental_stage": "unknown",
            "phenotype": "unknown",
        }

        # Validate values onto the following ontology versions.
        # These lookups are kept inside __init__ so they only run when a
        # curator is actually created.
        DEFAULT_SOURCES = {
            "disease": bt.Source.get(
                entity="bionty.Disease", name="mondo", version="2023-04-04"
            ),
            "developmental_stage": bt.Source.get(
                entity="bionty.DevelopmentalStage", name="hsapdv", version="2020-03-10"
            ),
            "phenotype": bt.Source.get(
                entity="bionty.Phenotype",
                name="hp",
                version="2023-06-17",
                organism="human",
            ),
        }

        self.data = data

        for col, default in DEFAULT_VALUES.items():
            if col not in self.data.columns:
                # Column absent entirely: create it filled with the default
                self.data[col] = default
            else:
                # Column present: only replace missing values
                self.data[col] = self.data[col].fillna(default)

        super().__init__(
            df=self.data,
            categoricals=DEFAULT_CATEGORICALS,
            sources=DEFAULT_SOURCES,
            organism="human",
        )

    def validate(self, organism: str | None = None) -> bool:
        """Validate the data against the internal EHR standard.

        Args:
            organism: Passed through unchanged to the parent ``validate``.

        Returns:
            ``False`` if a required column is missing (an error is logged and
            the parent validation is skipped); otherwise the result of the
            parent class's ``validate``.
        """
        missing_columns = {"disease", "phenotype", "developmental_stage", "age"} - set(
            self.data.columns
        )
        if missing_columns:
            # Sort so the message is stable when several columns are missing
            logger.error(
                f"Columns {', '.join(map(repr, sorted(missing_columns)))} are missing but required."
            )
            return False

        return super().validate(organism)

Use the custom curator

!lamin init --storage ./subclass-curator --modules bionty
 initialized lamindb: testuser1/subclass-curator
import lamindb as ln
import bionty as bt
import pandas as pd
from ehrcurator import EHRCurator

ln.track("2XEr2IA4n1w40000")
 connected lamindb: testuser1/subclass-curator
 created Transform('2XEr2IA4n1w40000'), started new Run('vWJeycMG...') at 2025-01-17 14:23:39 UTC
 notebook imports: bionty==1.0.0 ehrcurator lamindb==1.0rc1 pandas==2.2.3
# Create an example DataFrame that has all mandatory columns, except that the
# required 'age' column is wrongly named 'patient_age'
data = {
    "disease": [
        "Alzheimer disease",
        "diabetes mellitus",
        "breast cancer",
        "Hypertension",
        "asthma",
    ],
    "phenotype": [
        "Mental deterioration",
        "Hyperglycemia",
        "Tumor growth",
        "Increased blood pressure",
        "Airway inflammation",
    ],
    "developmental_stage": ["Adult", "Adult", "Adult", "Adult", "Child"],
    "patient_age": [70, 55, 60, 65, 12],
}
df = pd.DataFrame(data)
df
Hide code cell output
disease phenotype developmental_stage patient_age
0 Alzheimer disease Mental deterioration Adult 70
1 diabetes mellitus Hyperglycemia Adult 55
2 breast cancer Tumor growth Adult 60
3 Hypertension Increased blood pressure Adult 65
4 asthma Airway inflammation Child 12
# Wrap the DataFrame in the custom curator, then run the validation
ehrcurator = EHRCurator(df)
# Expected to fail at this point: the required 'age' column is not present yet
ehrcurator.validate()
Hide code cell output
 added 3 records with Feature.name for "columns": 'disease', 'phenotype', 'developmental_stage'
✗ Columns 'age' are missing but required.
False
# Fix the name of the wrongly named column.
# The curator holds a reference to `df` (not a copy), so this in-place rename
# is picked up by the next validate() call.
df.columns = df.columns.str.replace("patient_age", "age")
ehrcurator.validate()
Hide code cell output
 saving validated records of 'disease'
 added 4 records from public with Disease.name for "disease": 'Alzheimer disease', 'asthma', 'breast cancer', 'diabetes mellitus'
 saving validated records of 'phenotype'
 added 3 records from public with Phenotype.name for "phenotype": 'Hyperglycemia', 'Increased blood pressure', 'Mental deterioration'
 mapping "disease" on Disease.name
!   1 term is not validated: 'Hypertension'
    → fix typos, remove non-existent values, or save terms via .add_new_from("disease")
 mapping "phenotype" on Phenotype.name
!   2 terms are not validated: 'Tumor growth', 'Airway inflammation'
    → fix typos, remove non-existent values, or save terms via .add_new_from("phenotype")
 mapping "developmental_stage" on DevelopmentalStage.name
!   2 terms are not validated: 'Adult', 'Child'
    → fix typos, remove non-existent values, or save terms via .add_new_from("developmental_stage")
False
# Use lookup objects to curate the values:
# replace each non-validated term with the name of a term from the public ontology
disease_lo = bt.Disease.public().lookup()
phenotype_lo = bt.Phenotype.public().lookup()
developmental_stage_lo = bt.DevelopmentalStage.public().lookup()

df["disease"] = df["disease"].replace(
    {"Hypertension": disease_lo.hypertensive_disorder.name}
)
df["phenotype"] = df["phenotype"].replace(
    {
        "Tumor growth": phenotype_lo.neoplasm.name,
        "Airway inflammation": phenotype_lo.bronchitis.name,
    }
)
# NOTE(review): "Adult" is mapped to the adolescent-stage term below — confirm
# this is the intended target rather than an adult-stage term.
df["developmental_stage"] = df["developmental_stage"].replace(
    {
        "Adult": developmental_stage_lo.adolescent_stage.name,
        "Child": developmental_stage_lo.child_stage.name,
    }
)

# Re-run validation on the corrected values
ehrcurator.validate()
Hide code cell output
 saving validated records of 'disease'
 added 1 record from public with Disease.name for "disease": 'hypertensive disorder'
 saving validated records of 'phenotype'
 added 2 records from public with Phenotype.name for "phenotype": 'Bronchitis', 'Neoplasm'
 saving validated records of 'developmental_stage'
 added 2 records from public with DevelopmentalStage.name for "developmental_stage": 'child stage', 'adolescent stage'
 "disease" is validated against Disease.name
 "phenotype" is validated against Phenotype.name
 "developmental_stage" is validated against DevelopmentalStage.name
True
Hide code cell content
!rm -rf subclass-curator
!lamin delete --force subclass-curator
 deleting instance testuser1/subclass-curator