Source code for dna.dna_bimodality

#!/usr/bin/env python3

"""Module containing the HelParBimodality class and the command line interface."""
import os
import zipfile
from typing import Optional
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture  # type: ignore
from biobb_dna.utils import constants
from biobb_common.generic.biobb_object import BiobbObject
from biobb_common.tools.file_utils import launchlogger
from biobb_dna.utils.loader import load_data


[docs] class HelParBimodality(BiobbObject): """ | biobb_dna HelParBimodality | Determine binormality/bimodality from a helical parameter series dataset. | Determine binormality/bimodality from a helical parameter series dataset. Args: input_csv_file (str): Path to .csv file with helical parameter series. If `input_zip_file` is passed, this should be just the filename of the .csv file inside .zip. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/dna/series_shift_AT.csv>`_. Accepted formats: csv (edam:format_3752). input_zip_file (str) (Optional): .zip file containing the `input_csv_file` .csv file. File type: input. Accepted formats: zip (edam:format_3987). output_csv_path (str): Path to .csv file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/dna/AT_shift_bimod.csv>`_. Accepted formats: csv (edam:format_3752). output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/dna/AT_shift_bimod.jpg>`_. Accepted formats: jpg (edam:format_3579). properties (dict): * **helpar_name** (*str*) - (Optional) helical parameter name. * **confidence_level** (*float*) - (5.0) Confidence level for Byes Factor test (in percentage). * **max_iter** (*int*) - (400) Number of maximum iterations for EM algorithm. * **tol** (*float*) - (1e-5) Tolerance value for EM algorithm. * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files. * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist. * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.1 Examples: This is a use example of how to use the building block from Python:: from biobb_dna.dna.dna_bimodality import dna_bimodality prop = { 'max_iter': 500, } dna_bimodality( input_csv_file='filename.csv', input_zip_file='/path/to/input.zip', output_csv_path='/path/to/output.csv', output_jpg_path='/path/to/output.jpg', properties=prop) Info: * wrapped_software: * name: In house * license: Apache-2.0 * ontology: * name: EDAM * schema: http://edamontology.org/EDAM.owl """ def __init__(self, input_csv_file, output_csv_path, output_jpg_path, input_zip_file=None, properties=None, **kwargs) -> None: properties = properties or {} # Call parent class constructor super().__init__(properties) self.locals_var_dict = locals().copy() # Input/Output files self.io_dict = { 'in': { 'input_csv_file': input_csv_file, 'input_zip_file': input_zip_file }, 'out': { 'output_csv_path': output_csv_path, 'output_jpg_path': output_jpg_path } } # Properties specific for BB self.confidence_level = properties.get( "confidence_level", 5.0) self.max_iter = properties.get( "max_iter", 400) self.tol = properties.get( "tol", 1e-5) self.helpar_name = properties.get("helpar_name", None) self.properties = properties # Check the properties self.check_properties(properties) self.check_arguments()
[docs] @launchlogger def launch(self) -> int: """Execute the :class:`HelParBimodality <dna.dna_bimodality.HelParBimodality>` object.""" # Setup Biobb if self.check_restart(): return 0 self.stage_files() # get helical parameter from filename if not specified if self.helpar_name is None: for hp in constants.helical_parameters: if self.stage_io_dict.get("in", {}).get("input_zip_file") is not None: condition_2 = ( hp.lower() in Path(self.stage_io_dict['in']['input_zip_file']).name.lower()) else: condition_2 = False condition_1 = hp.lower() in Path( self.stage_io_dict['in']['input_csv_file']).name.lower() if (condition_1 or condition_2): self.helpar_name = hp if self.helpar_name is None: raise ValueError( "Helical parameter name can't be inferred from file, " "so it must be specified!") else: if self.helpar_name not in constants.helical_parameters: raise ValueError( "Helical parameter name is invalid! " f"Options: {constants.helical_parameters}") # get unit from helical parameter name if self.helpar_name in constants.hp_angular: self.hp_unit = "Degrees" else: self.hp_unit = "Angstroms" # resolve output output_csv_path = Path( self.stage_io_dict['out']['output_csv_path']).resolve() output_jpg_path = Path( self.stage_io_dict['out']['output_jpg_path']).resolve() # change directory to temporary folder original_directory = os.getcwd() os.chdir(self.stage_io_dict.get("unique_dir", "")) # read input if self.stage_io_dict.get("in", {}).get("input_zip_file") is not None: # if zipfile is specified, extract to temporary folder with zipfile.ZipFile( self.stage_io_dict['in']['input_zip_file'], 'r') as zip_ref: zip_ref.extractall(self.stage_io_dict.get("unique_dir", "")) data = load_data(Path(self.stage_io_dict['in']['input_csv_file']).name) means, variances, bics, weights = self.fit_to_model(data) uninormal, binormal, insuf_ev = self.bayes_factor_criteria( bics[0], bics[1]) if binormal: maxm = np.argmax(means[1]) minm = np.argmin(means[1]) mean1 = means[1][minm] var1 = variances[1][minm] w1 = weights[1][minm] mean2 = means[1][maxm] var2 = variances[1][maxm] w2 = weights[1][maxm] bimodal = self.helguero_theorem(mean1, mean2, var1, var2) else: mean1 = means[0][0] var1 = variances[0][0] w1 = weights[0][0] mean2, var2, w2 = np.nan, np.nan, 0 bimodal = False info = dict( binormal=binormal, uninormal=uninormal, insuf_ev=insuf_ev, bimodal=bimodal, mean1=mean1, mean2=mean2, var1=var1, var2=var2, w1=w1, w2=w2) # save tables pd.DataFrame(info, index=data.columns).to_csv(output_csv_path) # make and save plot data_size = len(data) synth1 = np.random.normal( loc=info['mean1'], scale=np.sqrt(info['var1']), size=int(data_size * info['w1'])) synth2 = np.random.normal( loc=info['mean2'], scale=np.sqrt(info['var2']), size=int(data_size * info['w2'])) plt.figure() alpha = 0.7 bins = 100 if binormal: label1 = "Low State" else: label1 = "Single State" out = plt.hist( synth1, bins=bins, alpha=alpha, density=True, label=label1) ylim = max(out[0]) # type: ignore plt.vlines(info['mean1'], 0, ylim, colors="r", linestyles="dashed") if binormal: out = plt.hist( synth2, bins=bins, alpha=alpha, density=True, label="high state") ylim = max(out[0]) # type: ignore plt.vlines(info['mean2'], 0, ylim, colors="r", linestyles="dashed") plt.legend() plt.ylabel("Density") plt.xlabel(f"{self.helpar_name.capitalize()} ({self.hp_unit})") plt.title(f"Distribution of {self.helpar_name} states") plt.savefig(output_jpg_path, format="jpg") plt.close() # change back to original directory os.chdir(original_directory) # Copy files to host self.copy_to_host() # Remove temporary file(s) self.remove_tmp_files() self.check_arguments(output_files_created=True, raise_exception=False) return 0
[docs] def fit_to_model(self, data): """ Fit data to Gaussian Mixture models. Return dictionary with distribution data. """ means = [] variances = [] bics = [] weights = [] for n_components in (1, 2): gmm = GaussianMixture( n_components=n_components, max_iter=self.max_iter, tol=self.tol) gmm = gmm.fit(data) m = gmm.means_.flatten() # type: ignore v = gmm.covariances_.flatten() # type: ignore b = gmm.bic(data) w = gmm.weights_.flatten() # type: ignore means.append(m) variances.append(v) bics.append(b) weights.append(w) return means, variances, bics, weights
[docs] def bayes_factor_criteria(self, bic1, bic2): diff_bic = bic2 - bic1 # probability of a two-component model p = 1 / (1 + np.exp(0.5*diff_bic)) if p == np.nan: if bic1 == np.nan: p = 1 elif bic2 == np.nan: p = 0 uninormal = p < (self.confidence_level / 100) binormal = p > (1 - (self.confidence_level / 100)) insuf_ev = True if (not uninormal and not binormal) else False return uninormal, binormal, insuf_ev
[docs] def helguero_theorem(self, mean1, mean2, var1, var2): r = var1 / var2 separation_factor = np.sqrt( -2 + 3*r + 3*r**2 - 2*r**3 + 2*(1 - r + r**2)**1.5 ) / ( np.sqrt(r)*(1+np.sqrt(r)) ) bimodal = abs(mean2-mean1) > separation_factor * \ (np.sqrt(var1) + np.sqrt(var2)) return bimodal
[docs] def dna_bimodality( input_csv_file, output_csv_path, output_jpg_path, input_zip_file: Optional[str] = None, properties: Optional[dict] = None, **kwargs) -> int: """Create :class:`HelParBimodality <dna.dna_bimodality.HelParBimodality>` class and execute the :meth:`launch() <dna.dna_bimodality.HelParBimodality.launch>` method.""" return HelParBimodality(**dict(locals())).launch()
dna_bimodality.__doc__ = HelParBimodality.__doc__ main = HelParBimodality.get_main(dna_bimodality, "Determine binormality/bimodality from a helical parameter dataset.") if __name__ == '__main__': main()