Skip to content

How to use biodem.dem

1. Nested cross-validation and data preprocessing

Please checkout the documentations at Modules > Utilities > Preprocessing Data > OptimizeLitdataNCV.

This is an example of running the module:

run_dem_prep.py
import os
import sys
from biodem import OptimizeLitdataNCV


trait_name = sys.argv[1]
which_o = int(sys.argv[2])
which_i = int(sys.argv[3])

if trait_name.startswith("all"):
    which_trait = None
else:
    which_trait = [trait_name]


if __name__ == "__main__":
    k_outer = 10
    k_inner = 5
    dir_home = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(dir_home, "run_dem", trait_name, "litdata")

    path_labels = os.path.join(dir_home, "data_prep", "phenotypes.csv")

    path_transformed_genotypes = os.path.join(dir_home, "run_s2g", trait_name, "transf")
    path_metabolome = os.path.join(dir_home, "data_prep", "omics_metabolome.parquet")
    path_fpkm = os.path.join(dir_home, "data_prep", "omics_fpkm_log2.parquet")

    dict_omics = {
        "transcriptome": path_fpkm,
        "metabolome": path_metabolome,
        "genotype": path_transformed_genotypes,
    }

    _opt = OptimizeLitdataNCV(
        paths_omics = dict_omics,
        path_label = path_labels,
        output_dir = output_dir,
        k_outer = k_outer,
        k_inner = k_inner,
        which_outer_inner = [which_o, which_i],
        col2use_in_labels = which_trait,
    )
    _opt.run_optimization()

2. Dual-extraction modeling

Please checkout the documentations at Modules > DEM > Pipeline > DEMFitPipe.

This is an example of running the module:

run_dem_fit.py
import os
import sys
from biodem import DEMFitPipe


if len(sys.argv) < 2:
    raise ValueError('Please specify the TRAIT NAME')
trait_name = sys.argv[1]

if len(sys.argv) < 4:
    print('Start default NCV: 10 outer folds and 5 inner folds')
    list_ncv = [[i,j] for i in range(10) for j in range(5)]
else:
    print('Start with NCV: {} outer folds and {} inner folds'.format(sys.argv[2], sys.argv[3]))
    list_ncv = [[int(sys.argv[2]), int(sys.argv[3])]]

work_dir_home = os.path.join(os.path.dirname(os.path.abspath(__file__)), "run_dem", trait_name)
litdata_dir = os.path.join(work_dir_home, 'litdata')
is_regression = True
log_dir = os.path.join(work_dir_home, 'models')


if __name__ == '__main__':
    _pipe = DEMFitPipe(
        litdata_dir=litdata_dir,
        list_ncv=list_ncv,
        log_dir=log_dir,
        regression=is_regression,
    )
    _pipe.train_pipeline()

3. Feature ranking

Please checkout the documentations at Modules > DEM > Feature ranking > DEMFeatureRanking.

This is an example of running the module:

run_dem_rank.py
import os
import sys
from biodem import DEMFeatureRanking


if len(sys.argv) < 3:
    raise ValueError('Please input trait name and outer index')
trait_name = sys.argv[1]
which_outer = int(sys.argv[2])

work_dir_home = os.path.join(os.path.dirname(os.path.abspath(__file__)), "run_dem", trait_name)
litdata_dir = os.path.join(work_dir_home, 'litdata')
log_dir = os.path.join(work_dir_home, 'models')

rank_result_path = os.path.join(work_dir_home, "feature_rank", f"rank_result_{trait_name}_outer+{which_outer}.csv")
random_seeds = [1000+i for i in range(20)]


if __name__ == "__main__":
    _feat_rank = DEMFeatureRanking()
    _feat_rank.run_a_outer(
        ncv_litdata_dir = litdata_dir,
        fit_log_dir = log_dir,
        which_outer = which_outer,
        output_path = rank_result_path,
        random_states = random_seeds,
    )

    # Collect ranks if several outer testset ranking results already exist
    # _feat_rank.collect_ranks(os.path.dirname(rank_result_path), os.path.join(os.path.dirname(rank_result_path), "rank_merged_sorted.csv"))