Source code for pl.pe.parest

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2018 Piero Dalle Pezze
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


# for computing the pipeline elapsed time 
import datetime
import glob
import logging
import os
import yaml
import traceback
from sbpipe.report.latex_reports import latex_report_pe, pdf_report
from sbpipe.utils.dependencies import is_r_package_installed
from sbpipe.utils.io import refresh
from sbpipe.utils.parcomp import parcomp
from sbpipe.utils.rand import get_rand_alphanum_str
from ..pipeline import Pipeline

logger = logging.getLogger('sbpipe')


[docs]class ParEst(Pipeline): """ This module provides the user with a complete pipeline of scripts for running model parameter estimations """
[docs] def __init__(self, models_folder='Models', working_folder='Results', sim_data_folder='param_estim_data', sim_plots_folder='param_estim_plots'): __doc__ = Pipeline.__init__.__doc__ Pipeline.__init__(self, models_folder, working_folder, sim_data_folder, sim_plots_folder)
[docs] def run(self, config_file): __doc__ = Pipeline.run.__doc__ logger.info("==============================") logger.info("Pipeline: parameter estimation") logger.info("==============================") logger.info("\n") logger.info("Loading file: " + config_file) logger.info("=============\n") # load the configuration file try: config_dict = Pipeline.load(config_file) except yaml.YAMLError as e: logger.error(e.message) logger.debug(traceback.format_exc()) return False except IOError: logger.error('File `' + config_file + '` does not exist.') logger.debug(traceback.format_exc()) return False # variable initialisation (generate_data, analyse_data, generate_report, generate_tarball, project_dir, simulator, model, cluster, local_cpus, round, runs, best_fits_percent, data_point_num, plot_2d_66cl_corr, plot_2d_95cl_corr, plot_2d_99cl_corr, logspace, scientific_notation) = self.parse(config_dict) runs = int(runs) #round = int(round) local_cpus = int(local_cpus) best_fits_percent = float(best_fits_percent) data_point_num = int(data_point_num) models_dir = os.path.join(project_dir, self.get_models_folder()) working_dir = os.path.join(project_dir, self.get_working_folder()) output_folder = os.path.splitext(model)[0] + "__round_" + str(round) outputdir = os.path.join(working_dir, output_folder) fileout_final_estims = "final_estim_collection.csv" fileout_all_estims = "all_estim_collection.csv" fileout_param_estim_best_fits_details = "param_estim_best_fits_details.csv" fileout_param_estim_details = "param_estim_details.csv" fileout_param_estim_summary = "param_estim_summary.csv" # Get the pipeline start time start = datetime.datetime.now().replace(microsecond=0) # preprocessing if not os.path.exists(outputdir): os.makedirs(outputdir) if generate_data: logger.info("\n") logger.info("Data generation:") logger.info("================") status = ParEst.generate_data(simulator, model, models_dir, cluster, local_cpus, runs, outputdir, os.path.join(outputdir, self.get_sim_data_folder())) if not status: return False if analyse_data: logger.info("\n") logger.info("Data analysis:") logger.info("==============") status = ParEst.analyse_data(simulator, os.path.splitext(model)[0], os.path.join(outputdir, self.get_sim_data_folder()), outputdir, fileout_final_estims, fileout_all_estims, fileout_param_estim_best_fits_details, fileout_param_estim_details, fileout_param_estim_summary, os.path.join(outputdir, self.get_sim_plots_folder()), best_fits_percent, data_point_num, cluster, plot_2d_66cl_corr, plot_2d_95cl_corr, plot_2d_99cl_corr, logspace, scientific_notation) if not status: return False if generate_report: logger.info("\n") logger.info("Report generation:") logger.info("==================") status = ParEst.generate_report(os.path.splitext(model)[0], outputdir, self.get_sim_plots_folder()) if not status: return False if generate_tarball: status = self.generate_tarball(working_dir, output_folder) if not status: return False # Print the pipeline elapsed time end = datetime.datetime.now().replace(microsecond=0) logger.info("\n\nPipeline elapsed time (using Python datetime): " + str(end - start)) return True
[docs] @classmethod def generate_data(cls, simulator, model, inputdir, cluster, local_cpus, runs, outputdir, sim_data_dir): """ The first pipeline step: data generation. :param simulator: the name of the simulator (e.g. Copasi) :param model: the model to process :param inputdir: the directory containing the model :param cluster: local, lsf for load sharing facility, sge for sun grid engine :param local_cpus: the number of cpu :param runs: the number of fits to perform :param outputdir: the directory to store the results :param sim_data_dir: the directory containing the simulation data sets :return: True if the task was completed successfully, False otherwise. """ if int(local_cpus) < 1: logger.error("variable local_cpus must be greater than 0. Please, check your configuration file.") return False if int(runs) < 1: logger.error("variable nfits must be greater than 0. Please, check your configuration file.") return False if not os.path.isfile(os.path.join(inputdir, model)): logger.error(os.path.join(inputdir, model) + " does not exist.") return False # folder preparation refresh(sim_data_dir, os.path.splitext(model)[0]) try: sim = cls.get_simul_obj(simulator) except TypeError as e: logger.error("simulator: " + simulator + " not found.") logger.debug(traceback.format_exc()) return False try: return sim.pe(model, inputdir, cluster, local_cpus, runs, outputdir, sim_data_dir) except Exception as e: logger.error(str(e)) logger.debug(traceback.format_exc()) return False
[docs] @classmethod def analyse_data(cls, simulator, model, inputdir, outputdir, fileout_final_estims, fileout_all_estims, fileout_param_estim_best_fits_details, fileout_param_estim_details, fileout_param_estim_summary, sim_plots_dir, best_fits_percent, data_point_num, cluster='local', plot_2d_66cl_corr=False, plot_2d_95cl_corr=False, plot_2d_99cl_corr=False, logspace=True, scientific_notation=True): """ The second pipeline step: data analysis. :param simulator: the name of the simulator (e.g. Copasi) :param model: the model name :param inputdir: the directory containing the simulation data :param outputdir: the directory to store the results :param fileout_final_estims: the name of the file containing final parameter sets with the objective value :param fileout_all_estims: the name of the file containing all the parameter sets with the objective value :param fileout_param_estim_best_fits_details: the file containing the statistics for the best fits analysis :param fileout_param_estim_details: the file containing the statistics for the PLE analysis :param fileout_param_estim_summary: the name of the file containing the summary for the parameter estimation :param sim_plots_dir: the directory of the simulation plots :param best_fits_percent: the percent to consider for the best fits :param data_point_num: the number of data points :param cluster: local, lsf for Load Sharing Facility, sge for Sun Grid Engine. :param plot_2d_66cl_corr: True if 2 dim plots for the parameter sets within 66% should be plotted :param plot_2d_95cl_corr: True if 2 dim plots for the parameter sets within 95% should be plotted :param plot_2d_99cl_corr: True if 2 dim plots for the parameter sets within 99% should be plotted :param logspace: True if parameters should be plotted in log space :param scientific_notation: True if axis labels should be plotted in scientific notation :return: True if the task was completed successfully, False otherwise. """ if not os.path.exists(inputdir) or not os.listdir(inputdir): logger.error("inputdir " + inputdir + " does not exist or is empty. Generate some data first.") return False if int(best_fits_percent) < 1 or int(best_fits_percent) > 100: logger.error("variable `best_fits_percent` must be in (0, 100]. Please, check your configuration file.") return False if int(data_point_num) < 0: logger.error("variable `data_point_num` must be >= 0. To visualise thresholds, `data_point_num` must be " "greater than the number of estimated parameters. Please, check your configuration file.") return False refresh(sim_plots_dir, os.path.splitext(model)[0]) logger.info("Collect results:") # Collect and summarises the parameter estimation results try: sim = cls.get_simul_obj(simulator) files_num = sim.get_best_fits(inputdir, outputdir, fileout_final_estims) sim.get_all_fits(inputdir, outputdir, fileout_all_estims) logger.info('Files retrieved: ' + str(files_num)) if files_num == 0: return False except Exception as e: logger.error("simulator: " + simulator + " not found.") import traceback logger.debug(traceback.format_exc()) return False # we don't replace any string in files. So let's use a substring which won't even be in any file. str_to_replace = get_rand_alphanum_str(10) logger.info("\n") logger.info("Fits analysis:") # requires devtools::install_github("pdp10/sbpiper") if not is_r_package_installed('sbpiper'): logger.critical('R package `sbpiper` was not found. Abort.') return False command = 'R --quiet -e \'library(sbpiper); sbpiper_pe(\"' + model + \ '\", \"' + os.path.join(outputdir, fileout_final_estims) + \ '\", \"' + os.path.join(outputdir, fileout_all_estims) + \ '\", \"' + sim_plots_dir + \ '\", ' + str(data_point_num) + \ ', \"' + os.path.join(outputdir, fileout_param_estim_best_fits_details) + \ '\", \"' + os.path.join(outputdir, fileout_param_estim_details) + \ '\", \"' + os.path.join(outputdir, fileout_param_estim_summary) + \ '\", ' + str(best_fits_percent) + \ ', ' + str(plot_2d_66cl_corr).upper() + \ ', ' + str(plot_2d_95cl_corr).upper() + \ ', ' + str(plot_2d_99cl_corr).upper() + \ ', ' + str(logspace).upper() + \ ', ' + str(scientific_notation).upper() # we replace \\ with / otherwise subprocess complains on windows systems. command = command.replace('\\', '\\\\') # We do this to make sure that characters like [ or ] don't cause troubles. command += ')\'' if not parcomp(command, str_to_replace, outputdir, cluster, 1, 1, False): return False if len(glob.glob(os.path.join(sim_plots_dir, os.path.splitext(model)[0] + '*.pdf'))) == 0: return False return True
[docs] @classmethod def generate_report(cls, model, outputdir, sim_plots_folder): """ The third pipeline step: report generation. :param model: the model name :param outputdir: the directory to store the report :param sim_plots_folder: the folder containing the plots :return: True if the task was completed successfully, False otherwise. """ if not os.path.exists(os.path.join(outputdir, sim_plots_folder)): logger.error( "input_dir " + os.path.join(outputdir, sim_plots_folder) + " does not exist. Analyse the data first.") return False logger.info("Generating LaTeX report") filename_prefix = "report__param_estim_" latex_report_pe(outputdir, sim_plots_folder, model, filename_prefix) logger.info("Generating PDF report") pdf_report(outputdir, filename_prefix + model + ".tex") if len(glob.glob(os.path.join(outputdir, '*' + os.path.splitext(model)[0] + '*.pdf'))) == 0: return False return True
[docs] def parse(self, my_dict): __doc__ = Pipeline.parse.__doc__ generate_data = True analyse_data = True generate_report = True generate_tarball = False project_dir = '.' model = 'model' # The simulator simulator = 'Copasi' # The parallel mechanism to use (local | sge | lsf). cluster = 'local' # The number of cpus local_cpus = 1 # The parameter estimation round round = 1 # The number of jobs to be executed runs = 25 # The percent of best fits to consider best_fits_percent = 100 # The number of available data points data_point_num = 10 # Plot 2D correlations using data from 66% confidence levels # This can be very time/memory consuming plot_2d_66cl_corr = False # Plot 2D correlations using data from 95% confidence levels # This can be very time/memory consuming plot_2d_95cl_corr = False # Plot 2D correlations using data from 99% confidence levels # This can be very time/memory consuming plot_2d_99cl_corr = False # True if the parameters should be plotted in log10 space. logspace = True # True if axis labels should be plotted in scientific notation scientific_notation = True # Initialises the variables for key, value in my_dict.items(): logger.info(key + ": " + str(value)) if key == "generate_data": generate_data = value elif key == "analyse_data": analyse_data = value elif key == "generate_report": generate_report = value elif key == "generate_tarball": generate_tarball = value elif key == "project_dir": project_dir = value elif key == "model": model = value elif key == "simulator": simulator = value elif key == "cluster": cluster = value elif key == "round": round = value elif key == "runs": runs = value elif key == "local_cpus": local_cpus = value elif key == "best_fits_percent": best_fits_percent = value elif key == "data_point_num": data_point_num = value elif key == "plot_2d_66cl_corr": plot_2d_66cl_corr = value elif key == "plot_2d_95cl_corr": plot_2d_95cl_corr = value elif key == "plot_2d_99cl_corr": plot_2d_99cl_corr = value elif key == "logspace": logspace = value elif key == "scientific_notation": scientific_notation = value else: logger.warning('Found unknown option: `' + key + '`') return (generate_data, analyse_data, generate_report, generate_tarball, project_dir, simulator, model, cluster, local_cpus, round, runs, best_fits_percent, data_point_num, plot_2d_66cl_corr, plot_2d_95cl_corr, plot_2d_99cl_corr, logspace, scientific_notation)