Source code for omniplate.Omniplate

# !/usr/bin/env python
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import warnings
import re
from pathlib import Path
import gaussianprocessderivatives as gp
import om_code.omgenutils as gu
from om_code.omfitderiv import fitderiv
import om_code.omplot as omplot
import om_code.loadplatedata as loadplatedata
import om_code.omerrors as errors
import om_code.corrections as corrections
import om_code.sunder as sunder
import om_code.omstats as omstats
import om_code.admin as admin
import om_code.clogger as clogger

version = "0.9.42"

plt.rcParams["figure.max_open_warning"] = 0
sns.set()


[docs]class platereader:
    """
    for analyzing plate-reader data, correcting for autofluorescence, and
    determining growth rates.

    All data is stored used Panda's dataframes and plotted using Seaborn.

    Three dataframes are created. If p is an instance of the platereader class,
    then p.r contains the raw data for each well in the plate; p.s contains the
    processed time-series using the data from all relevant wells; and p.sc
    constains any summary statistics, such as 'max gr'.

    For time series sampled from a Gaussian process, the mean is used as the
    statistics and errors are estimated by the standard deviation.
    For statistics calculated from time series, the median is used and errors
    are estimated by half the interquartile range, with the distribution of
    the statistic calculated by sampling time series.

    Examples
    -------
    A typical work flow is:

    >>> import omniplate as om

    then either

    >>> p= om.platereader('GALdata.xlsx', 'GALcontents.xlsx',
    ...                    wdir= 'data/')

    or

    >>> p= om.platereader()
    >>> p.load('GALdata.xls', 'GALcontents.xlsx')

    and to analyse OD data

    >>> p.plot('OD', plate= True)
    >>> p.correctOD()
    >>> p.correctmedia()
    >>> p.plot(y= 'OD')
    >>> p.plot(y= 'OD', hue= 'strain',
    ...        conditionincludes= ['Gal', 'Glu'],
    ...        strainexcludes= 'HXT7')
    >>> p.getstats('OD')

    and for fluorescence data

    >>> p.correctauto(['GFP', 'AutoFL'])
    >>> p.plot(y= 'c-GFPperOD', hue= 'condition')

    and to save the results

    >>> p.savefigs()
    >>> p.exportdf()

    General properties of the data and of previous processing are shown with:

    >>> p.info
    >>> p.attributes()
    >>> p.corrections()
    >>> p.log

    See also

        http://swainlab.bio.ed.ac.uk/software/omniplate/index.html

    for a tutorial, which can be opened directly using

    >>> p.webhelp()
    """

    # ratio of fluorescence emission at 585nm to emiisions at 525nm for eGFP
    _gamma = 0.114

    ###
[docs]    def __init__(
        self,
        dnames=False,
        anames=False,
        wdir=".",
        platereadertype="Tecan",
        dsheetnumbers=False,
        asheetnumbers=False,
        ODfname=None,
        info=True,
        ls=True,
    ):
        """
        Initiate and potentially immediately load data for processing.

        Parameters
        ----------
        dnames: string or list of strings, optional
            The name of the file containing the data from the plate reader or
            a list of file names.
        anames: string or list of strings, optional
            The name of file containing the corresponding annotation or a list
            of file names.
        wdir: string, optional
            The working directory where the data files are stored and where
            output will be saved.
        platereadertype: string
            The type of plate reader, currently either 'Tecan' or 'Sunrise' or
            'old Tecan'.
        dsheetnumbers: integer or list of integers, optional
            The relevant sheets of the Excel files storing the data.
        asheetnumbers: integer or list of integers, optional
            The relevant sheets of the corresponding Excel files for the
            annotation.
        ODfname: string, optional
            The name of the file with the dilution data used to correct OD for
            its non-linear dependence on numbers of cells. If unspecified, data
            for haploid budding yeast growing in glucose is used.
        info: boolean
            If True (default), display summary information on the data once
            loaded.
        ls: boolean
            If True (default), display contents of working directory.
        """
        self.__version__ = version
        print("\nomniplate version=", self.__version__)
        # absolute path
        self.wdirpath = Path(wdir)

        # enable logging
        # self._initialiselogging()
        # self._logmethod(self.logger)
        self.logger, self.logstream = clogger.initlog(version)

        if True:
            # warning generated occasionally when sampling from the Gaussian
            # process likely because of numerical errors
            warnings.simplefilter("ignore", RuntimeWarning)

        # dictionary recording extent of analysis
        self.progress = {
            "ignoredwells": {},
            "negativevalues": {},
            "ODfname": {},
            "gc": {},
        }
        self.allexperiments = []
        self.allconditions = {}
        self.allstrains = {}
        self.datatypes = {}

        if dnames is False:
            # list all files in current directory
            if ls:
                self.ls
        else:
            # immediately load data
            self.load(
                dnames,
                anames,
                platereadertype,
                dsheetnumbers,
                asheetnumbers,
                ODfname,
                info,
            )

    ###

    def __repr__(self):
        repstr = "{} v{}: ".format(self.__class__.__name__, self.__version__)
        for e in self.allexperiments:
            repstr += e + " ; "
        if repstr[-2:] == "; ":
            repstr = repstr[:-3]
        return repstr

    ###
    @property
    def ls(self):
        """
        List all files in the working directory.
        A dictionary of available datasets is created as a shortcut.

        Example
        -------
        >>> p.ls
        >>> p.ds
        >>> p.load(p.ds[0], p.ds[1])
        """
        self.ds = {}
        print("Working directory is", str(self.wdirpath.resolve()))
        print("Files available are:", "\n---")
        files = []
        for f in self.wdirpath.glob("*.*"):
            if f.is_file() and (
                f.suffix == ".xlsx"
                or f.suffix == ".json"
                or f.suffix == ".tsv"
                or f.suffix == ".csv"
                or f.suffix == ".xls"
            ):
                print(f.stem + f.suffix)
                files.append(f.stem + f.suffix)
        print()
        self.ds = {i: f for i, f in enumerate(sorted(files))}

    ###

[docs]    def changewdir(self, wdir):
        """
        Change working directory.

        Parameters
        ----------
        wdir: string
            The new working directory specified from the current directory.

        Example
        -------
        >>> p.changewdir('newdata/')
        """
        self.wdirpath = Path(wdir)
        self.ls

    ###

[docs]    @clogger.log
    def load(
        self,
        dnames,
        anames=False,
        platereadertype="Tecan",
        dsheetnumbers=False,
        asheetnumbers=False,
        ODfname=None,
        info=True,
    ):
        """
        Loads raw data files generated by the plate reader and the
        corresponding annotation files.

        Parameters
        ----------
        dnames: string or list of strings, optional
            The name of the file containing the data from the plate reader
            or a list of file names.
        anames: string or list of strings, optional
            The name of file containing the corresponding annotation or a list
            of file names.
        platereadertype: string
            The type of plate reader, currently either 'Tecan' or 'Sunrise' or
            'old Tecan'.
        dsheetnumbers: integer or list of integers, optional
            The relevant sheets of the Excel files storing the data.
        asheetnumbers: integer or list of integers, optional
            The relevant sheets of the corresponding Excel files for the
            annotation.
        ODfname: string, optional
            The name of the file with the dilution data used to correct OD for
            its non-linear dependence on numbers of cells. If unspecified, data
            for haploid budding yeast growing in glucose is used.
        info: boolean
            If True (default), display summary information on the data once
            loaded.

        Examples
        -------
        >>> p.load('Data.xlsx', 'DataContents.xlsx')
        >>> p.load('Data.xlsx', 'DataContents.xlsx', info= False)
        >>> p.load('Data.xlsx', 'DataContents.xlsx',
        ...         ODfname= 'ODcorrection_Glucose_Diploid.txt')
        """
        dnames = gu.makelist(dnames)
        if not anames:
            anames = [
                dname.split(".")[0] + "Contents.xlsx" for dname in dnames
            ]
        else:
            anames = gu.makelist(anames)
        if not dsheetnumbers:
            dsheetnumbers = [0 for dname in dnames]
        if not asheetnumbers:
            asheetnumbers = [0 for dname in dnames]

        alldata = {}
        for i, dname in enumerate(dnames):
            # get dataframe for raw data
            (
                rdf,
                allconditionssingle,
                allstrainssingle,
                alldatasingle,
                experiment,
                datatypes,
            ) = loadplatedata.loadfromplate(
                platereadertype,
                self.wdirpath,
                dname,
                dsheetnumbers[i],
                anames[i],
                asheetnumbers[i],
            )
            self.allexperiments.append(experiment)
            self.allconditions[experiment] = allconditionssingle
            self.allstrains[experiment] = allstrainssingle
            self.datatypes[experiment] = datatypes
            alldata.update(alldatasingle)
            self.r = (
                pd.merge(self.r, rdf, how="outer")
                if hasattr(self, "r")
                else rdf
            )
            # update progress dictionary
            admin.initialiseprogress(self, experiment)

        # define ODfname in progress dictionary
        if ODfname:
            if isinstance(ODfname, str):
                self.progress["ODfname"] = {
                    exp: ODfname for exp in self.allexperiments
                }
            else:
                self.progress["ODfname"] = {
                    exp: ODfname[i]
                    for i, exp in enumerate(self.allexperiments)
                }
        else:
            self.progress["ODfname"] = {
                exp: None for exp in self.allexperiments
            }

        # dataframe for summary stats and corrections
        alldfs = []
        # for exp in self.allexperiments:
        for exp in alldata:
            strs, cons = [], []
            for cs in alldata[exp]:
                strs.append(cs.split(" in ")[0])
                cons.append(cs.split(" in ")[1])
            corrdict = {
                "experiment": exp,
                "strain": strs,
                "condition": cons,
                "OD corrected": False,
            }
            corrdict.update(
                {
                    dtype + " corrected for media": False
                    for dtype in self.datatypes[exp]
                }
            )
            corrdict.update(
                {
                    dtype + " corrected for autofluorescence": False
                    for dtype in self.datatypes[exp]
                    if dtype not in ["AutoFL", "OD"]
                }
            )
            alldfs.append(pd.DataFrame(corrdict))
        self.sc = pd.concat(alldfs)

        # dataframe of original data
        self.origr = self.r.copy()
        # dataframe for well content
        self.wellsdf = admin.makewellsdf(self.r)
        # dataframe for summary data
        self.s = admin.make_s(self)

        # display info on experiment, conditions and strains
        if info:
            self.info
        print(
            '\nWarning: wells with no strains have been changed to "Null"'
            "\nto avoid confusion with pandas.\n"
        )

    ###
    # Routines to display information on data and state of data processing
    ###
    @property
    def info(self):
        """
        Displays conditions, strains, and datatypes.

        Example
        -------
        >>> p.info
        """
        for exp in self.allexperiments:
            print("\nExperiment:", exp, "\n---")
            print("Conditions:")
            for c in sorted(self.allconditions[exp], key=gu.natural_keys):
                print("\t", c)
            print("Strains:")
            for s in sorted(self.allstrains[exp], key=gu.natural_keys):
                print("\t", s)
            print("Data types:")
            for d in self.datatypes[exp]:
                print("\t", d)
            if self.progress["ignoredwells"]:
                print("Ignored wells:")
                if self.progress["ignoredwells"][exp]:
                    for d in self.progress["ignoredwells"][exp]:
                        print("\t", d)
                else:
                    print("\t", "None")
        print()

    ###

[docs]    def webhelp(self, browser=None):
        """
        Opens detailed examples of how to use in omniplate in a web browser.

        Parameters
        ----------
        browser: string, optional
            The browser to use - either the default if unspecified or 'firefox',
            'chrome', etc.

        Example
        -------
        >>> p.webhelp()
        """
        import webbrowser

        url = "https://swainlab.bio.ed.ac.uk/software/omniplate/index.html"
        webbrowser.get(browser).open_new(url)

    ###
    @property
    def attributes(self):
        """
        Displays the names of the attributes of the current instance of
        platereader and acts as a check to see what variables have been
        calculated or determined.

        Example
        -------
        >>> p.attributes
        """
        ignore = [
            "d",
            "consist",
            "t",
            "nosamples",
            "_gamma",
            "ODfname",
            "overflow",
            "nooutchannels",
            "nodata",
            "__doc__",
        ]
        for a in self.__dict__:
            if (
                "corrected" not in a
                and "processed" not in a
                and a not in ignore
            ):
                print(a)

    ###

[docs]    @clogger.log
    def rename(self, translatedict):
        """
        Uses a dictionary to replace all occurrences of a strain or a condition
        with an alternative.
        Note that instances of self.progress will not be updated.

        Parameters
        ----------
        translatedict: dictionary
            A dictionary of old name - new name pairs

        Example
        -------
        >>> p.rename({'77.WT' : 'WT', '409.Hxt4' : 'Hxt4'})
        """
        # replace in dataframes
        for df in [self.r, self.s, self.sc]:
            df.replace(translatedict, inplace=True)
        # rename in attributes

        def applydict(a):
            return translatedict[a] if a in translatedict else a

        for e in self.allexperiments:
            self.allconditions[e] = list(map(applydict, self.allconditions[e]))
            self.allstrains[e] = list(map(applydict, self.allstrains[e]))

    ###

[docs]    def corrections(
        self,
        experiments="all",
        conditions="all",
        strains="all",
        experimentincludes=False,
        experimentexcludes=False,
        conditionincludes=False,
        conditionexcludes=False,
        strainincludes=False,
        strainexcludes=False,
    ):
        """
        Displays the status of corrections made for the specified strains,
        conditions, and experiments.

        Parameters
        ----------
        experiments: string or list of strings
            The experiments to include.
        conditions: string or list of strings
            The conditions to include.
        strains: string or list of strings
            The strains to include.
        experimentincludes: string, optional
            Selects only experiments that include the specified string in their
            name.
        experimentexcludes: string, optional
            Ignores experiments that include the specified string in their
            name.
        conditionincludes: string, optional
            Selects only conditions that include the specified string in their
            name.
        conditionexcludes: string, optional
            Ignores conditions that include the specified string in their name.
        strainincludes: string, optional
            Selects only strains that include the specified string in their
            name.
        strainexcludes: string, optional
            Ignores strains that include the specified string in their name.

        Returns
        -------
        df: dataframe
            Contains the status of the corrections for the specified strains,
            conditions, and experiments.

        Examples
        --------
        >>> p.corrections()
        >>> p.corrections(strainincludes= 'GAL')
        """
        exps, cons, strs = sunder.getall(
            self,
            experiments,
            experimentincludes,
            experimentexcludes,
            conditions,
            conditionincludes,
            conditionexcludes,
            strains,
            strainincludes,
            strainexcludes,
        )
        df = self.sc.query(
            "experiment == @exps and condition == @cons and strain == @strs"
        )
        # only show corrections and not stats
        df = df[
            ["experiment", "strain", "condition"]
            + [col for col in df.columns if "correct" in col]
        ]
        df = df.T
        return df

    ###

[docs]    @clogger.log
    def addcolumn(self, newcolumnname, oldcolumn, newcolumnvalues):
        """
        Adds a new column to all dataframes by parsing an existing column.
        All possible entries for the new column are specified as strings and
        the entry in the new column will be whichever of these strings is
        present in the entry of the existing column.

        Parameters
        ----------
        newcolumnname: string
            The name of the new column.
        oldcolumn: string
            The name of the column to be parsed to create the new column.
        newcolumnvalues: list of strings
            All of the possible values for the entries in the new column.

        Example
        -------
        >>> p.addcolumn('medium', 'condition', ['Raffinose',
        ...                                     'Geneticin'])

        will parse each entry in 'condition' to create a new column called
        'medium' that has either a value 'Raffinose' if 'Raffinose' is in the
        entry from 'condition' or a value 'Geneticin' if 'Geneticin' is in the
        entry from 'condition'.
        """
        for df in [self.r, self.s, self.sc]:
            newcol = np.array(
                ("",) * len(df[oldcolumn].to_numpy()), dtype="object"
            )
            for i, oldcolvalue in enumerate(df[oldcolumn].to_numpy()):
                for newcolvalue in newcolumnvalues:
                    if newcolvalue in oldcolvalue:
                        newcol[i] = newcolvalue
            df[newcolumnname] = newcol

    ###

[docs]    @clogger.log
    def addnumericcolumn(
        self,
        newcolumnname,
        oldcolumn,
        picknumber=0,
        leftsplitstr=None,
        rightsplitstr=None,
        asstr=False,
    ):
        """
        Adds a new numeric column by parsing the numbers from the entries of
        an existing column.
        It is best to run this command only after the basic analyses -
        ignorewells, correctOD, and correctmedia - have been performed because
        it changes the structure of the dataframes and may cause errors.


        Parameters
        ----------
        newcolumnname: string
            The name of the new column.
        oldcolumn: string
            The name of column to be parsed.
        picknumber: integer
            The number to pick from the list of numbers extracted from the
            existing column's entry.
        leftsplitstr: string, optional
            Split the entry of the column using whitespace and parse numbers
            from the substring to the immediate left of leftsplitstr rather
            than the whole entry.
        rightsplitstr: string, optional
            Split the entry of the column using whitespace and parse numbers
            from the substring to the immediate right of rightsplitstr rather
            than the whole entry.
        asstr: boolean
            If True, convert the numeric value to a string to improve plots
            with seaborn.

        Examples
        --------
        To extract concentrations from conditions use

        >>> p.addnumericcolumn('concentration', 'condition')

        For a condition like '0.5% Raf 0.05ug/mL Cycloheximide', use

        >>> p.addnumericcolumn('raffinose', 'condition',
        ...                     picknumber= 0)
        >>> p.addnumericcolumn('cycloheximide', 'condition',
        ...                     picknumber= 1)
        """
        # process splitstrs
        if leftsplitstr or rightsplitstr:
            splitstr = leftsplitstr if leftsplitstr else rightsplitstr
            locno = -1 if leftsplitstr else 1
        else:
            splitstr = False
        # change each dataframe
        for df in [self.r, self.s, self.sc]:
            if asstr:
                # new column of strings
                newcol = np.full_like(
                    df[oldcolumn].to_numpy(), "", dtype="object"
                )
            else:
                # new column of floats
                newcol = np.full_like(
                    df[oldcolumn].to_numpy(), np.nan, dtype="float"
                )
            # parse old column
            for i, oldcolvalue in enumerate(df[oldcolumn].to_numpy()):
                if oldcolvalue:
                    # split string first on spaces and then find substring
                    # adjacent to specified splitstring
                    if splitstr:
                        if splitstr in oldcolvalue:
                            # oldcolvalue contains leftsplitstring or
                            # rightsplitstring
                            bits = oldcolvalue.split()
                            for k, bit in enumerate(bits):
                                if splitstr in bit:
                                    loc = k + locno
                                    break
                            # adjacent string
                            oldcolvalue = bits[loc]
                        else:
                            # oldcolvalue does not contain leftsplitstring
                            # or rightsplitstring
                            oldcolvalue = ""
                    # loop through all floats in oldcolvalue
                    nocount = 0
                    for ci in re.split(
                        r"[+-]?([0-9]+(?:[.][0-9]*)?|[.][0-9]+)", oldcolvalue
                    ):
                        try:
                            no = float(ci)
                            if nocount == picknumber:
                                newcol[i] = ci if asstr else no
                                break
                            nocount += 1
                        except ValueError:
                            pass
            df[newcolumnname] = newcol

    ###

[docs]    @clogger.log
    def add_to_sc(
        self,
        newcolumn=None,
        s_column=None,
        func=None,
        experiments="all",
        experimentincludes=False,
        experimentexcludes=False,
        conditions="all",
        conditionincludes=False,
        conditionexcludes=False,
        strains="all",
        strainincludes=False,
        strainexcludes=False,
    ):
        """
        Applies func to a column in the s dataframe and
        stores the results in the sc dataframe.

        Parameters
        ----------
        newcolumn:  string
            The name of the new column in the sc dataframe
        s_column:   string
            The name of the column in s dataframe from which the
            data is to be processed
        func:   function
            The function to be applied to the data in the s dataframe.

        Examples
        --------

        >>> p.add_to_sc(newcolumn= "max GFP", s_column= "GFP mean",
        ...             func= np.nanmax)
        >>> p.add_to_sc(newcolumn= "GFP lower quartile", s_column= "GFP mean",
        ...             func= lambda x: np.nanquantile(x, 0.25))
        """
        # extract data
        exps, cons, strs = sunder.getall(
            self,
            experiments,
            experimentincludes,
            experimentexcludes,
            conditions,
            conditionincludes,
            conditionexcludes,
            strains,
            strainincludes,
            strainexcludes,
        )
        self.sc[newcolumn] = np.nan
        for e in exps:
            for c in cons:
                for s in strs:
                    d = self.s.query(
                        "experiment == @e and condition == @c and "
                        "strain == @s"
                    )[s_column].values
                    res = np.asarray(func(d))
                    if res.size == 1:
                        self.sc.loc[
                            (self.sc.experiment == e)
                            & (self.sc.condition == c)
                            & (self.sc.strain == s),
                            newcolumn,
                        ] = func(d)
                    else:
                        print("func must return a single value")
                        return False

    ###

[docs]    @clogger.log
    def addcommonvar(
        self,
        var="time",
        dvar=None,
        varmin=None,
        varmax=None,
        figs=True,
        experiments="all",
        experimentincludes=False,
        experimentexcludes=False,
        conditions="all",
        conditionincludes=False,
        conditionexcludes=False,
        strains="all",
        strainincludes=False,
        strainexcludes=False,
    ):
        """
        Adds to time-dependent dataframes a common variable whose values only
        come from a fixed array so that they are from the same array for all
        experiments. This common variable allows averaging across experiments
        and typically is time.

        For example, the plate reader often does not perfectly increment time
        between measurements and different experients can have slightly
        different time points despite the plate reader having the same
        settings. These unique times prevent seaborn from taking averages.

        If experiments have measurements that start at the same time point and
        have the same interval between measurements, then setting a commontime
        for all experiments will allow seaborn to perform averaging.

        The array of the common variable runs from varmin to varmax with an
        interval dvar. These parameters are automatically calculated, but may
        be specified.

        Each instance of var is assigned a common value - the closest instance
        of the common variable to the instance of var. Measurements are assumed
        to the same for the true instance of var and for the assigned common
        value, which may generate errors if these two are sufficiently
        distinct.

        An alternative method is averageoverexpts.

        Parameters
        ----------
        var: string
            The variable from which the common variable is generated,
            typically 'time'.
        dvar: float, optional
            The interval between the values comprising the common array.
        varmin: float, optional
            The minimum of the common variable.
        varmax: float, optional
            The maximum of the common variable.
        figs: boolean
            If True, generate plot to check if the variable and the common
            variable generated from it are sufficiently close in value.
        experiments: string or list of strings
            The experiments to include.
        conditions: string or list of strings
            The conditions to include.
        strains: string or list of strings
            The strains to include.
        experimentincludes: string, optional
            Selects only experiments that include the specified string in their
            name.
        experimentexcludes: string, optional
            Ignores experiments that include the specified string in their
            name.
        conditionincludes: string, optional
            Selects only conditions that include the specified string in their
            name.
        conditionexcludes: string, optional
            Ignores conditions that include the specified string in their name.
        strainincludes: string, optional
            Selects only strains that include the specified string in their
            name.
        strainexcludes: string, optional
            Ignores strains that include the specified string in their name.

        Example
        -------
        To plot averages of time-dependent variables over experiments, use for
        example

        >>> p.addcommonvar('time')
        >>> p.plot(x= 'commontime', y= 'c-GFPperOD',
        ...        hue= 'condition', style= 'strain')
        """
        exps, cons, strs = sunder.getall(
            self,
            experiments,
            experimentincludes,
            experimentexcludes,
            conditions,
            conditionincludes,
            conditionexcludes,
            strains,
            strainincludes,
            strainexcludes,
            nonull=True,
        )
        print("Finding common" + var)
        for df in [self.r, self.s]:
            if var in df:
                loc = (
                    df.experiment.isin(exps)
                    & df.condition.isin(cons)
                    & df.strain.isin(strs)
                )
                print("r dataframe") if df.equals(self.r) else print(
                    "s dataframe"
                )
                if dvar is None:
                    # calculated for tidy printing
                    elen = np.max([len(e) for e in exps]) + 5
                    # find median increment in var
                    for e in exps:
                        evar = df[loc][var].to_numpy()
                        print(
                            " {:{}} {}_min= {:.2e} ; d{}= {:.2e}".format(
                                e,
                                elen,
                                var,
                                np.min(evar),
                                var,
                                np.median(np.diff(evar)),
                            )
                        )
                    ldvar = np.median(np.diff(df[loc][var].to_numpy()))
                else:
                    ldvar = dvar
                print(" Using d{}= {:.2e}".format(var, ldvar))
                lvarmin = df[loc][var].min() if varmin is None else varmin
                print(" Using {}_min= {:.2e}\n".format(var, lvarmin))
                lvarmax = df[loc][var].max() if varmax is None else varmax
                # define common var
                cvar = np.arange(lvarmin, lvarmax, ldvar)
                df.loc[loc, "common" + var] = df[loc][var].apply(
                    lambda x: cvar[np.argmin((x - cvar) ** 2)]
                )
                if figs:
                    plt.figure()
                    sl = np.linspace(
                        df[loc][var].min(), 1.05 * df[loc][var].max(), 100
                    )
                    plt.plot(sl, sl, alpha=0.4)
                    plt.plot(
                        df[loc][var].to_numpy(),
                        df[loc]["common" + var].to_numpy(),
                        ".",
                    )
                    plt.xlabel(var)
                    plt.ylabel("common" + var)
                    title = (
                        "r dataframe" if df.equals(self.r) else "s dataframe"
                    )
                    plt.title(title)
                    plt.suptitle(
                        "comparing "
                        + var
                        + " with common"
                        + var
                        + " – the line y= x is expected"
                    )
                    plt.tight_layout()
                    plt.show()

    ###
    # Routines to examine and ignore individual wells
    ###

[docs]    def contentsofwells(self, wlist):
        """
        Displays contents of wells

        Parameters
        ----------
        wlist: string or list of string
            Specifies the well or wells of interest.

        Examples
        --------
        >>> p.contentsofwells(['A1', 'E4'])
        """
        wlist = gu.makelist(wlist)
        for w in wlist:
            print("\n" + w + "\n--")
            print(
                self.wellsdf.query("well == @w")
                .drop(["well"], axis=1)
                .to_string(index=False)
            )

    ###

[docs]    def showwells(
        self,
        concise=False,
        sortby=False,
        experiments="all",
        conditions="all",
        strains="all",
        experimentincludes=False,
        experimentexcludes=False,
        conditionincludes=False,
        conditionexcludes=False,
        strainincludes=False,
        strainexcludes=False,
    ):
        """
        Displays wells for specified experiments, conditions, and strains.

        Parameters
        ----------
        concise: boolean
            If True, display as experiment: condition: strain.
        sortby: list of strings, optional
            List of column names on which to sort the results.
        experiments: string or list of strings
            The experiments to include.
        conditions: string or list of strings
            The conditions to include.
        strains: string or list of strings
            The strains to include.
        experimentincludes: string, optional
            Selects only experiments that include the specified string in their
            name.
        experimentexcludes: string, optional
            Ignores experiments that include the specified string in their
            name.
        conditionincludes: string, optional
            Selects only conditions that include the specified string in their
            name.
        conditionexcludes: string, optional
            Ignores conditions that include the specified string in their name.
        strainincludes: string, optional
            Selects only strains that include the specified string in their
            name.
        strainexcludes: string, optional
            Ignores strains that include the specified string in their name.

        Examples
        --------
        >>> p.showwells()
        >>> p.showwells(strains= 'Mal12:GFP', conditions= '1% Mal')
        """
        exps, cons, strs = sunder.getall(
            self,
            experiments,
            experimentincludes,
            experimentexcludes,
            conditions,
            conditionincludes,
            conditionexcludes,
            strains,
            strainincludes,
            strainexcludes,
            nonull=False,
        )
        if not hasattr(self, "wellsdf"):
            self.wellsdf = admin.makewellsdf(self.r)
        df = self.wellsdf.query(
            "experiment == @exps and condition == @cons and strain == @strs"
        )
        if sortby:
            df = df.sort_values(by=gu.makelist(sortby))
        print()
        for e in exps:
            if concise:
                print(
                    df[["experiment", "condition", "strain"]]
                    .drop_duplicates()
                    .query("experiment == @e")
                    .to_string(index=False)
                )
            else:
                print(df.query("experiment == @e").to_string(index=False))
            print()

    ###

[docs]    @clogger.log
    def ignorewells(
        self,
        exclude=[],
        experiments="all",
        experimentincludes=False,
        experimentexcludes=False,
        clearall=False,
    ):
        """
        Allows wells to be ignored in any future processing.
        If called several times, the default behaviour is for any previously
        ignored wells not to be re-instated.

        Parameters
        ---------
        exclude: list of strings
            List of labels of wells on the plate to be excluded.
        experiments: string or list of strings
            The experiments to include.
        experimentincludes: string, optional
            Selects only experiments that include the specified string in their
            name.
        experimentexcludes: string, optional
            Ignores experiments that include the specified string in their
            name.
        clearall: boolean
            If True, all previously ignored wells are re-instated.

        Example
        -------
        >>> p.ignorewells(['A1', 'C2'])
        """
        if clearall:
            # forget any previously ignoredwells
            self.r = self.origr.copy()
            self.progress["ignoredwells"] = {
                exp: [] for exp in self.allexperiments
            }
            admin.update_s(self)
            print(
                "Warning: all corrections and analysis to raw data have been"
                " lost. No wells have been ignored."
            )
        else:
            if gu.islistempty(exclude):
                return
            else:
                # exclude should be a list of lists
                if isinstance(exclude, str):
                    exclude = [gu.makelist(exclude)]
                elif isinstance(exclude[0], str):
                    exclude = [exclude]
                # check consistency
                if len(self.allexperiments) == 1:
                    exps = self.allexperiments
                else:
                    exps = sunder.getexps(
                        self,
                        experiments,
                        experimentincludes,
                        experimentexcludes,
                    )
                if len(exclude) != len(exps) and not clearall:
                    raise errors.IgnoreWells(
                        "Either a list of wells to exclude for a particular\n"
                        "experiment or a list of experiments must be given."
                    )
                else:
                    # drop wells
                    for ex, exp in zip(exclude, exps):
                        # wells cannot be ignored twice
                        wex = list(
                            set(ex) - set(self.progress["ignoredwells"][exp])
                        )
                        # drop data from ignored wells
                        df = self.r
                        filt = (df["experiment"] == exp) & df["well"].isin(wex)
                        df = df.loc[~filt]
                        df = df.reset_index(drop=True)
                        self.r = df
                        # store ignoredwells
                        self.progress["ignoredwells"][exp] += ex
                        # remove any duplicates
                        self.progress["ignoredwells"][exp] = list(
                            set(self.progress["ignoredwells"][exp])
                        )
                    anycorrections = np.count_nonzero(
                        self.sc[
                            [
                                col
                                for col in self.sc.columns
                                if "correct" in col
                            ]
                        ].values
                    )
                    if np.any(anycorrections):
                        print(
                            "Warning: you have ignored wells after correcting\n"
                            "the data. It is best to ignorewells first, before\n"
                            "running any analysis."
                        )
                # remake summary data
                admin.update_s(self)

    ###

[docs]    @clogger.log
    def restricttime(self, tmin=None, tmax=None):
        """
        Restrict the processed data to a range of time, ignoring points outside
        this time range.

        Note that data in the .s dataframe outside the time range is lost.
        Exporting the dataframes before running restricttime is recommended.

        Parameters
        ----------
        tmin: float
            The minimum value of time, with data kept only for t >= tmin.
        tmax: float
            The maximum value of time, with data kept only for t <= tmax.

        Example
        -------
        >>> p.restricttime(tmin= 5)
        """
        if tmin is None:
            tmin = self.r.time.min()
        if tmax is None:
            tmax = self.r.time.max()
        if tmax > tmin:
            self.s = self.s[(self.s.time >= tmin) & (self.s.time <= tmax)]
        else:
            print("tmax or tmin is not properly defined")

    ###
    # Routines for plotting
    ###
[docs]    @clogger.log
    def plot(
        self,
        x="time",
        y="OD",
        hue="strain",
        style="condition",
        size=None,
        kind="line",
        col=None,
        row=None,
        height=5,
        aspect=1,
        ymin=None,
        figsize=False,
        returnfacetgrid=False,
        title=None,
        plate=False,
        wells=False,
        nonull=False,
        messages=False,
        sortby=False,
        experiments="all",
        conditions="all",
        strains="all",
        experimentincludes=False,
        experimentexcludes=False,
        conditionincludes=False,
        conditionexcludes=False,
        strainincludes=False,
        strainexcludes=False,
        **kwargs,
    ):
        """
        Plots from the underlying dataframes (chosen automatically) using
        Seaborn's relplot, which is described at
        https://seaborn.pydata.org/generated/seaborn.relplot.html

        Parameters
        ----------
        x: string
            The variable - column of the dataframe - for the x-axis.
        y: string
            The variable - column of the dataframe - for y-axis.
        hue: string
            The variable whose variation will determine the colours of the
            lines plotted. From Seaborn.
        style: string
            The variable whose variation will determine the style of each line.
            From Seaborn.
        size: string
            The variable whose vairation will determine the size of each
            marker. From Seaborn.
        kind: string
            Either 'line' or 'scatter', which determines the type of plot.
            From Seaborn.
        col: string, optional
            The variable that varies over the columns in a multipanel plot.
            From Seaborn.
        row: string, optional
            The variable that varies over the rows in a multipanel plot.
            From Seaborn.
        height: float, optional
            The height of the individual panels in a multipanel plot.
            From Seaborn.
        aspect: float, optional
            The aspect ratio of the individual panels in a multipanel plot.
            From Seaborn.
        ymin: float, optional
            The minimum y-value
        figsize: tuple, optional
            A tuple of (width, height) for the size of figure.
            Ignored if wells= True or plate= True.
        returnfacetgrid: boolean, optional
            If True, return Seaborn's facetgrid object created by relplot
        title: float, optional
            The title of the plot (overwrites any default titles).
        plate: boolean, optional
            If True, data for each well for a whole plate are plotted in one
            figure.
        wells: boolean, optional
            If True, data for the individual wells is shown.
        nonull: boolean, optional
            If True, 'Null' strains are not plotted.
        sortby: list of strings, optional
            A list of columns to sort the data in the dataframe and passed to
            pandas sort_values.
        messsages: boolean, optional
            If True, print warnings for any data requested but not found.
        experiments: string or list of strings
            The experiments to include.
        conditions: string or list of strings
            The conditions to include.
        strains: string or list of strings
            The strains to include.
        experimentincludes: string, optional
            Selects only experiments that include the specified string in
            their name.
        experimentexcludes: string, optional
            Ignores experiments that include the specified string in their
            name.
        conditionincludes: string, optional
            Selects only conditions that include the specified string in their
            name.
        conditionexcludes: string, optional
            Ignores conditions that include the specified string in their name.
        strainincludes: string, optional
            Selects only strains that include the specified string in their
            name.
        strainexcludes: string, optional
            Ignores strains that include the specified string in their name.
        kwargs: for Seaborn's relplot
            https://seaborn.pydata.org/generated/seaborn.relplot.html

        Returns
        -------
        sfig: Seaborn's facetgrid object generated by relplot if
        returnfacetgrid= True

        Examples
        --------
        >>> p.plot(y= 'OD', plate= True)
        >>> p.plot(y= 'OD', wells= True, strainincludes= 'Gal10:GFP')
        >>> p.plot(y= 'OD')
        >>> p.plot(x= 'OD', y= 'gr')
        >>> p.plot(y= 'c-GFPperOD', nonull= True, ymin= 0)
        >>> p.plot(y= 'c-GFPperOD', conditionincludes= '2% Mal',
        ...        hue= 'strain')
        >>> p.plot(y= 'c-mCherryperOD', conditions= ['0.5% Mal',
        ...        '1% Mal'], hue= 'strain', style= 'condition',
        ...         nonull= True, strainincludes= 'mCherry')
        >>> p.plot(y= 'c-GFPperOD', col= 'experiment')
        >>> p.plot(y= 'max gr')
        """
        # choose the correct dataframe
        basedf, dfname = omplot.plotfinddf(self, x, y)
        # get experiments, conditions and strains
        exps, cons, strs = sunder.getall(
            self,
            experiments,
            experimentincludes,
            experimentexcludes,
            conditions,
            conditionincludes,
            conditionexcludes,
            strains,
            strainincludes,
            strainexcludes,
            nonull,
        )
        # choose the right type of plot
        if plate:
            dtype = y if x == "time" else x
            omplot.plotplate(basedf, exps, dtype)
        elif wells:
            omplot.plot_wells(
                x,
                y,
                basedf,
                exps,
                cons,
                strs,
                style,
                size,
                kind,
                col,
                row,
                ymin,
                title,
                messages,
                **kwargs,
            )
        elif dfname == "s" or dfname == "r":
            omplot.plot_rs(
                x,
                y,
                basedf,
                exps,
                cons,
                strs,
                hue,
                style,
                size,
                kind,
                col,
                row,
                height,
                aspect,
                ymin,
                title,
                figsize,
                sortby,
                returnfacetgrid,
                **kwargs,
            )
        elif dfname == "sc":
            omplot.plot_sc(
                x,
                y,
                basedf,
                exps,
                cons,
                strs,
                hue,
                style,
                size,
                kind,
                col,
                row,
                height,
                aspect,
                ymin,
                figsize,
                title,
                sortby,
                **kwargs,
            )
        else:
            raise errors.PlotError("No data found")

    ###

[docs]    def savefigs(self, fname=None, onefile=True):
        """
        Saves all current figures to PDF, either to one file or each to a
        separate file.

        Parameters
        ----------
        fname: string, optional
            Name of file. If unspecified, the name of the experiment is used.
        onefile: boolean, optional
            If False, each figures is save to its own PDF file.

        Example
        -------
        >>> p.savefigs()
        >>> p.savefigs('figures.pdf')
        """
        if fname:
            if ".pdf" not in fname:
                fname += ".pdf"
            fname = str(self.wdirpath / fname)
        else:
            fname = str(
                self.wdirpath / ("".join(self.allexperiments) + ".pdf")
            )
        if onefile:
            gu.figs2pdf(fname)
        else:
            for i in plt.get_fignums():
                plt.figure(i)
                savename = str(plt.getp(plt.gcf(), "axes")[0].title).split(
                    "'"
                )[1]
                savename = savename.replace(" ", "_")
                if savename == "":
                    savename = "Whole_plate_Figure_" + str(i)
                print("Saving", savename)
                plt.savefig(str(self.wdirpath / (savename + ".pdf")))

    ###
    @property
    def close(self):
        """
        Close all figures.

        Example
        -------
        >>> p.close
        """
        plt.close("all")

    ###

[docs]    @clogger.log
    def getdataframe(
        self,
        dfname="s",
        experiments="all",
        conditions="all",
        strains="all",
        experimentincludes=False,
        experimentexcludes=False,
        conditionincludes=False,
        conditionexcludes=False,
        strainincludes=False,
        strainexcludes=False,
        nonull=True,
    ):
        """
        Obtain a subset of the data in a dataframe, which can be used plotting
        directly.

        Parameters
        ---------
        dfname: string
            The dataframe of interest either 'r' (raw data),
            's' (default; processed data),
            or 'sc' (summary statistics).
        experiments: string or list of strings
            The experiments to include.
        conditions: string or list of strings
            The conditions to include.
        strains: string or list of strings
            The strains to include.
        experimentincludes: string, optional
            Selects only experiments that include the specified string in their
            name.
        experimentexcludes: string, optional
            Ignores experiments that include the specified string in their
            name.
        conditionincludes: string, optional
            Selects only conditions that include the specified string in their
            name.
        conditionexcludes: string, optional
            Ignores conditions that include the specified string in their name.
        strainincludes: string, optional
            Selects only strains that include the specified string in their
            name.
        strainexcludes: string, optional
            Ignores strains that include the specified string in their name.
        nonull: boolean, optional
            If True, ignore 'Null' strains

        Returns
        -------
        ndf: dataframe

        Example
        -------
        >>> ndf= p.getdataframe('s', conditions= ['2% Glu'],
        ...                     nonull= True)
        """
        exps, cons, strs = sunder.getall(
            self,
            experiments,
            experimentincludes,
            experimentexcludes,
            conditions,
            conditionincludes,
            conditionexcludes,
            strains,
            strainincludes,
            strainexcludes,
            nonull,
        )
        if hasattr(self, dfname):
            df = getattr(self, dfname)
            ndf = df.query(
                "experiment == @exps and condition == @cons "
                "and strain == @strs"
            )
            if ndf.empty:
                print("No data found")
            else:
                return ndf.copy()
        else:
            raise errors.UnknownDataFrame(
                "Dataframe " + dfname + " is not recognised"
            )

    ###
    # OD correction
    ###
[docs]    @clogger.log
    def correctOD(
        self,
        figs=True,
        odmatch=0.3,
        experiments="all",
        experimentincludes=False,
        experimentexcludes=False,
        conditions="all",
        conditionincludes=False,
        conditionexcludes=False,
    ):
        """
        Corrects OD data for the non-linear relationship between OD and cell
        number.
        Requires a set of dilution data set, with the default being haploid
        yeast growing in glucose (collected by L Bandiera).
        An alternative can be loaded from a file - a txt file of two columns
        with OD specified in the first column and the dilution factor specified
        in descending order in the second.

        Parameters
        ---------
        figs: boolean, optional
            If True, a plot of the fit to the dilution data is produced.
        odmatch: float, optional
            If non-zero, then the corrected OD is rescaled to equal the
            measured OD at this value. Only large ODs typically need to be
            corrected.
        experiments: string or list of strings
            The experiments to include.
        conditions: string or list of strings
            The conditions to include.
        experimentincludes: string, optional
            Selects only experiments that include the specified string in their
            name.
        experimentexcludes: string, optional
            Ignores experiments that include the specified string in their
            name.
        conditionincludes: string, optional
            Selects only conditions that include the specified string in their
            name.
        conditionexcludes: string, optional
            Ignores conditions that include the specified string in their name.

        Examples
        -------
        >>> p.correctOD()
        >>> p.correctOD(figs= False)
        """
        exps = sunder.getexps(
            self, experiments, experimentincludes, experimentexcludes
        )
        cons = sunder.getcons(
            self,
            conditions,
            conditionincludes,
            conditionexcludes,
            nomedia=False,
        )
        for exp in exps:
            for c in cons:
                if self.sc[
                    (self.sc.experiment == exp) & (self.sc.condition == c)
                ]["OD corrected"].any():
                    print(exp, ": OD is already corrected for", c)
                else:
                    # fit dilution data
                    if not self.progress["gc"][exp]:
                        ODfname = (
                            str(self.wdirpath / self.progress["ODfname"][exp])
                            if self.progress["ODfname"][exp]
                            else None
                        )
                        gc = corrections.findODcorrection(
                            self.wdirpath,
                            ODfname,
                            exp,
                            figs,
                            odmatch,
                        )
                        self.progress["gc"][exp] = gc
                        # copy gc to experiments with the same ODfname
                        for e in self.allexperiments:
                            if (
                                self.progress["ODfname"][e]
                                == self.progress["ODfname"][exp]
                            ):
                                self.progress["gc"][e] = gc
                    # correct all wells
                    gc.batchpredict(
                        self.r.query("experiment == @exp and condition == @c")[
                            "OD"
                        ].to_numpy()
                    )
                    # update r dataframe
                    self.r.loc[
                        (self.r.experiment == exp) & (self.r.condition == c),
                        "OD",
                    ] = gc.f
                    # flag corrections in summary stats dataframe
                    self.sc.loc[
                        (self.sc.experiment == exp) & (self.sc.condition == c),
                        "OD corrected",
                    ] = True
        # update s dataframe
        admin.update_s(self)

    ###
    # Media correction
    ###
[docs]    @clogger.log
    def correctmedia(
        self,
        datatypes="all",
        commonmedia=False,
        experiments="all",
        experimentincludes=False,
        experimentexcludes=False,
        conditions="all",
        conditionincludes=False,
        conditionexcludes=False,
        figs=False,
        log=True,
        frac=0.33,
    ):
        """
        Corrects OD or fluorescence for the OD or fluorescence of the media
        using data from wells marked Null.
        Uses lowess to smooth measurements of from all Null wells and subtracts
        this smoothed time series from the raw data.

        Parameters
        ----------
        datatypes: string or list of strings
            Data types to be corrected.
        commonmedia: string
            A condition containing Null wells that should be used to correct
            media for other conditions.
        experiments: string or list of strings
            The experiments to include.
        conditions: string or list of strings
            The conditions to include.
        experimentincludes: string, optional
            Selects only experiments that include the specified string in their
            name.
        experimentexcludes: string, optional
            Ignores experiments that include the specified string in their
            name.
        conditionincludes: string, optional
            Selects only conditions that include the specified string in their
            name.
        conditionexcludes: string, optional
            Ignores conditions that include the specified string in their name.
        figs: boolean, optional
            If True, display fits to data for the Null wells.
        frac: float
            The fraction of the data used for smoothing via lowess.
            https://www.statsmodels.org/devel/generated/statsmodels.nonparametric.smoothers_lowess.lowess.html

        Examples
        --------
        >>> p.correctmedia()
        >>> p.correctmedia('OD')
        >>> p.correctmedia(commonmedia= '1% Glu')
        """
        exps = sunder.getexps(
            self, experiments, experimentincludes, experimentexcludes
        )
        cons = sunder.getcons(
            self,
            conditions,
            conditionincludes,
            conditionexcludes,
            nomedia=False,
        )
        for exp in exps:
            # data types
            expdatatypes = (
                self.datatypes[exp]
                if datatypes == "all"
                else gu.makelist(datatypes)
            )
            # correct for media
            for dtype in expdatatypes:
                for c in cons:
                    if self.sc[
                        (self.sc.experiment == exp) & (self.sc.condition == c)
                    ][dtype + " corrected for media"].any():
                        print(
                            exp + ":",
                            dtype,
                            "is already corrected for media in",
                            c,
                        )
                    else:
                        print(exp + ": Correcting", dtype, "for media in", c)
                        cm = commonmedia if commonmedia else c
                        # update r dataframe
                        (
                            success,
                            negvalues,
                        ) = corrections.performmediacorrection(
                            self.r, dtype, exp, c, figs, cm, frac
                        )
                        if success:
                            self.sc.loc[
                                (self.sc.experiment == exp)
                                & (self.sc.condition == c),
                                dtype + " corrected for media",
                            ] = True
                            if negvalues:
                                if not self.progress["negativevalues"][exp]:
                                    self.progress["negativevalues"][
                                        exp
                                    ] = negvalues
                                else:
                                    self.progress["negativevalues"][
                                        exp
                                    ] += negvalues
            if self.progress["negativevalues"][exp]:
                print(
                    "\nWarning: correcting media has created negative "
                    "values in",
                    exp,
                    "for",
                )
                print(self.progress["negativevalues"][exp])
        # update s dataframe
        admin.update_s(self)

    ###
    # Statistical analysis
    ###
[docs]    @clogger.log
    def getstats(
        self,
        dtype="OD",
        bd=False,
        cvfn="matern",
        empirical_errors=False,
        noruns=10,
        exitearly=True,
        noinits=100,
        nosamples=100,
        logs=True,
        iskip=False,
        stats=True,
        figs=True,
        findareas=False,
        plotlocalmax=True,
        showpeakproperties=False,
        experiments="all",
        experimentincludes=False,
        experimentexcludes=False,
        conditions="all",
        conditionincludes=False,
        conditionexcludes=False,
        strains="all",
        strainincludes=False,
        strainexcludes=False,
        **kwargs,
    ):
        """
        Calls fitderiv.py to estimate the first and second time-derivatives of,
        typically, OD using a Gaussian process (Swain et al., 2016) and find
        corresponding summary statistics.

        The derivatives are stored in the .s dataframe; summary statistics are
        stored in the .sc dataframe.

        Parameters
        ----------
        dtype: string, optional
            The type of data - 'OD', 'GFP', 'c-GFPperOD', or 'c-GFP' - for
            which the derivatives are to be found. The data must exist in the
            .r or .s dataframes.
        bd: dictionary, optional
            The bounds on the hyperparameters for the Gaussian process.
            For example, bd= {1: [-2,0])} fixes the bounds on the
            hyperparameter controlling flexibility to be 1e-2 and 1e0.
            The default for a Matern covariance function
            is {0: (-5,5), 1: (-4,4), 2: (-5,2)},
            where the first element controls amplitude, the second controls
            flexibility, and the third determines the magnitude of the
            measurement error.
        cvfn: string, optional
            The covariance function used in the Gaussian process, either
            'matern' or 'sqexp' or 'nn'.
        empirical_errors: boolean, optional
            If True, measurement errors are empirically estimated from the
            variance across replicates at each time point and so vary with
            time.
            If False, the magnitude of the measurement error is fit from the
            data assuming that this magnitude is the same at all time points.
        noruns: integer, optional
            The number of attempts made for each fit. Each attempt is made
            with random initial estimates of the hyperparameters within their
            bounds.
        exitearly: boolean, optional
            If True, stop at the first successful fit.
            If False, use the best fit from all successful fits.
        noinits: integer, optional
            The number of random attempts to find a good initial condition
            before running the optimization.
        nosamples: integer, optional
            The number of samples used to calculate errors in statistics by
            bootstrapping.
        logs: boolean, optional
            If True, find the derivative of the log of the data and should be
            True to determine the specific growth rate when dtype= 'OD'.
        iskip: integer, optional
            Use only every iskip'th data point to increase speed.
        stats: boolean, optional
            If False, do not calculate statistics.
        figs: boolean, optional
            If True, plot both the fits and inferred derivative.
        findareas: boolean, optional
            If True, find the area under the plot of gr vs OD and the area
            under the plot of OD vs time. Setting to True can make getstats
            slow.
        plotlocalmax: boolean, optional
            If True, mark the highest local maxima found, which is used to
            calculate statistics, on any plots.
        showpeakproperties: boolean, optional
            If True, show properties of any local peaks that have found by
            scipy's find_peaks. Additional properties can be specified as
            kwargs and are passed to find_peaks.
        experiments: string or list of strings
            The experiments to include.
        conditions: string or list of strings
            The conditions to include.
        strains: string or list of strings
            The strains to include.
        experimentincludes: string, optional
            Selects only experiments that include the specified string in their
            name.
        experimentexcludes: string, optional
            Ignores experiments that include the specified string in their
            name.
        conditionincludes: string, optional
            Selects only conditions that include the specified string in their
            name.
        conditionexcludes: string, optional
            Ignores conditions that include the specified string in their name.
        strainincludes: string, optional
            Selects only strains that include the specified string in their
            name.
        strainexcludes: string, optional
            Ignores strains that include the specified string in their name.
        kwargs: for scipy's find_peaks
            To set the minimum property of a peak. e.g. prominence= 0.1 and
            width= 15 (specified in numbers of x-points or y-points and not
            real units).
            https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.find_peaks.html

        Examples
        --------
        >>> p.getstats()
        >>> p.getstats(conditionincludes= 'Gal')
        >>> p.getstats(noruns= 10, exitearly= False)

        If the fits are poor, often changing the bounds on the hyperparameter
        for the measurement error helps:

        >>> p.getstats(bd= {2: (-3,0)})

        References
        ----------
        PS Swain, K Stevenson, A Leary, LF Montano-Gutierrez, IB Clark,
        J Vogel, T Pilizota. (2016). Inferring time derivatives including cell
        growth rates using Gaussian processes. Nat Commun, 7, 1-8.
        """
        linalgmax = 5
        warnings = ""
        if dtype == "OD" and logs:
            derivname = "gr"
        else:
            derivname = "d/dt " + dtype
        snames = [
            "max " + derivname,
            "time of max " + derivname,
        ]
        if dtype == "OD" and logs:
            # special names with estimating specific growth rate
            snames += ["doubling time", "lag time"]
        else:
            snames += [
                "doubling time from " + derivname,
                "lag time from " + derivname,
            ]
        if logs:
            ylabels = ["log(" + dtype + ")", derivname]
        else:
            ylabels = [dtype, derivname]
        # extract data
        exps, cons, strs = sunder.getall(
            self,
            experiments,
            experimentincludes,
            experimentexcludes,
            conditions,
            conditionincludes,
            conditionexcludes,
            strains,
            strainincludes,
            strainexcludes,
        )
        # find growth rate and stats
        for e in exps:
            for c in cons:
                for s in strs:
                    figtitle = e + ": " + s + " in " + c
                    if dtype in self.r.columns:
                        # raw data
                        d = sunder.extractwells(self.r, self.s, e, c, s, dtype)
                        t = self.s.query(
                            "experiment == @e and condition == @c and "
                            "strain == @s"
                        )["time"].to_numpy()
                    elif dtype in self.s.columns:
                        # processed data
                        df = self.s.query(
                            "experiment == @e and condition == @c and "
                            "strain == @s"
                        )
                        # add columns plus and minus err
                        df = omplot.augmentdf(df, dtype)[
                            [dtype, "augtype", "time"]
                        ]
                        piv_df = df.pivot("time", "augtype", dtype)
                        # convert to array for fitderiv
                        d = piv_df.values
                        t = piv_df.index.to_numpy()
                        numberofnans = np.count_nonzero(np.isnan(d))
                        if np.any(numberofnans):
                            print(f"\nWarning: {numberofnans} NaNs in data")
                    else:
                        print(dtype, "not recognized for", figtitle)
                        return
                    # checks
                    if d.size == 0:
                        # no data
                        print("No data found for", dtype, "for", figtitle)
                        break
                    if logs:
                        print("\nFitting log(" + dtype + ") for", figtitle)
                    else:
                        print("\nFitting", dtype, "for", figtitle)
                    # call fitderiv
                    f = fitderiv(
                        t,
                        d,
                        cvfn=cvfn,
                        logs=logs,
                        bd=bd,
                        empirical_errors=empirical_errors,
                        statnames=snames,
                        noruns=noruns,
                        noinits=noinits,
                        exitearly=exitearly,
                        linalgmax=linalgmax,
                        nosamples=nosamples,
                        iskip=iskip,
                    )
                    if f.success:
                        if figs:
                            plt.figure()
                            plt.subplot(2, 1, 1)
                            f.plotfit(
                                "f", ylabel=ylabels[0], figtitle=figtitle
                            )
                            axgr = plt.subplot(2, 1, 2)
                            f.plotfit("df", ylabel=ylabels[1])
                            plt.tight_layout()
                        # find summary statistics
                        outdf, statsdict, warning = omstats.findsummarystats(
                            dtype,
                            derivname,
                            logs,
                            nosamples,
                            f,
                            t,
                            e,
                            c,
                            s,
                            findareas,
                            figs,
                            plotlocalmax,
                            axgr,
                            showpeakproperties,
                            **kwargs,
                        )
                        if warning:
                            warnings += warning
                        # store results in instance's dataframes
                        statsdict[
                            "logmaxlikehood for " + derivname
                        ] = f.logmaxlike
                        statsdict["gp for " + derivname] = cvfn
                        if stats:
                            for sname in f.ds.keys():
                                statsdict[sname] = f.ds[sname]
                        # add growth rates, etc., to dataframe of summary data
                        if derivname not in self.s.columns:
                            # add new columns to dataframe
                            self.s = pd.merge(self.s, outdf, how="outer")
                        else:
                            # update dataframe
                            self.s = gu.absorbdf(
                                self.s,
                                outdf,
                                ["experiment", "condition", "strain", "time"],
                            )
                        # create or add summary stats to stats dataframe
                        statsdf = pd.DataFrame(
                            statsdict, index=pd.RangeIndex(0, 1, 1)
                        )
                        newstats = np.count_nonzero(
                            [
                                True if stat not in self.sc.columns else False
                                for stat in statsdict
                            ]
                        )
                        if newstats:
                            # add new columns to dataframe
                            self.sc = pd.merge(self.sc, statsdf, how="outer")
                        else:
                            # update dataframe
                            self.sc = gu.absorbdf(
                                self.sc,
                                statsdf,
                                ["experiment", "condition", "strain"],
                            )
                        if figs:
                            plt.show()
        omstats.cleansc(self)
        if warnings:
            print(warnings)

    ###
[docs]    @clogger.log
    def averageoverexpts(
        self,
        condition,
        strain,
        tvr="OD mean",
        bd=False,
        addnoise=True,
        plot=False,
    ):
        """
        Uses a Matern Gaussian process to average a time-dependent variable
        over all experiments.

        An alternative and best first choice is to use addcommonvar.

        Parameters
        ----------
        condition: string
            The condition of interest.
        strain: string
            The strain of interest.
        tvr: float
            The time-dependent variable to be averaged.
            For example, 'c-GFPperOD' or 'OD mean'.
        bd: dictionary, optional
            The limits on the hyperparameters for the Matern Gaussian process.
            For example, {0: (-5,5), 1: (-4,4), 2: (-5,2)}
            where the first element controls amplitude, setting the bounds to
            1e-5 and 1e5, the second controls flexibility, and the third
            determines the magnitude of the measurement error.
        addnoise: boolean
            If True, add the fitted magnitude of the measurement noise to the
            predicted standard deviation for better comparison with the spread
            of the data.

        Returns
        -------
        res: dictionary
            {'t' : time, tvr : time-dependent data, 'mn' : mean,
            'sd' : standard deviation}
            where 'mn' is the average found and 'sd' is its standard deviation.
            'tvr' is the data used to find the average.

        Examples
        --------
        >>> p.averageoverexpts('1% Gal', 'GAL2', bd= {1: [-1,-1])})
        """
        # boundaries on hyperparameters
        if "OD" in tvr:
            bds = {0: (-4, 4), 1: (-1, 4), 2: (-6, 2)}
        else:
            bds = {0: (2, 12), 1: (-1, 4), 2: (4, 10)}
        if bd:
            bds = gu.mergedicts(original=bds, update=bd)
        # extract data
        df = self.s[["experiment", "condition", "strain", "time", tvr]]
        ndf = df.query("condition == @condition and strain == @strain")
        # use GP to average over experiments
        x = ndf["time"].to_numpy()
        y = ndf[tvr].to_numpy()
        ys = y[np.argsort(x)]
        xs = np.sort(x)
        g = gp.maternGP(bds, xs, ys)
        print(
            "averaging over", tvr, "experiments for", strain, "in", condition
        )
        g.findhyperparameters(noruns=2, noinits=1000)
        g.results()
        g.predict(xs, addnoise=addnoise)
        if plot:
            plt.figure()
            g.sketch(".")
            plt.title("averaging " + strain + " in " + condition)
            plt.xlabel("time")
            plt.ylabel(tvr)
            plt.show()
        # return results as a dictionary
        res = {"t": xs, tvr: ys, "mn": g.f, "sd": np.sqrt(g.fvar)}
        return res

    ###
    # Fluorescence corrections
    ###
[docs]    @clogger.log
    def correctauto(
        self,
        f=["GFP", "AutoFL"],
        refstrain="WT",
        figs=True,
        experiments="all",
        experimentincludes=False,
        experimentexcludes=False,
        conditions="all",
        conditionincludes=False,
        conditionexcludes=False,
        strains="all",
        strainincludes=False,
        strainexcludes=False,
    ):
        """
        Corrects fluorescence data for autofluorescence by comparing with the
        fluorescence of an untagged reference strain.

        The reference strain is used to estimate the autofluoresence via either
        the method of Licthen et al., 2014, where measurements of fluoescence
        at two wavelengths is required, or by using the fluorescence of the
        reference strain interpolated to the OD of the strain of interest
        (Berthoumieux et al., 2013).

        Using two measurements of fluorescence is thought to be more accurate,
        particularly for low fluorescence measurements (Mihalcescu et al.,
        2015).

        Arguments
        --
        f: string or list of strings
            The fluorescence measurements, typically either ['mCherry'] or
            ['GFP', 'AutoFL'].
        refstrain: string
            The reference strain.
        figs: boolean
            If True, display plots showing the fits to the reference strain's
            fluorescnce.
        experiments: string or list of strings
            The experiments to include.
        conditions: string or list of strings
            The conditions to include.
        strains: string or list of strings
            The strains to include.
        experimentincludes: string, optional
            Selects only experiments that include the specified string in
            their name.
        experimentexcludes: string, optional
            Ignores experiments that include the specified string in their
            name.
        conditionincludes: string, optional
            Selects only conditions that include the specified string in their
            name.
        conditionexcludes: string, optional
            Ignores conditions that include the specified string in their name.
        strainincludes: string, optional
            Selects only strains that include the specified string in their
            name.
        strainexcludes: string, optional
            Ignores strains that include the specified string in their name.

        Notes
        -----
        In principle

        >>> p.correctmedia()

        should be run before running correctauto when processing data with two
        fluorescence measurements.

        It is unnecessary with only one fluorescence measurement because the
        normalisation is then done directly with the reference strain's
        fluorescence and this fluorescence can include the fluorescence from
        the media.

        In practice, running correctmedia may generate negative values of the
        fluorescence at some time points. These negative values will create
        NaNs in the corrected fluorescence, which are normally harmless.

        With sufficiently many negative values of the fluorescence, however,
        correcting data with two fluorescence measurements can become
        corrupted.

        If correctmedia generates negative fluorescence values, we therefore
        recommend comparing the corrected fluorescence between

        >>> p.correctmedia()
        >>> p.correctauto(['GFP', 'AutoFL')

        and

        >>> p.correctauto('GFP')

        to determine if these negative values are deleterious.

        Examples
        --------
        To correct data with one type of fluorescence measurement, use:

        >>> p.correctauto('GFP')
        >>> p.correctauto('mCherry', refstrain= 'BY4741')

        To correct data with two types of fluorescence measurement, use:

        >>> p.correctauto(['GFP', 'AutoFL'])
        >>> p.correctauto(['GFP', 'AutoFL'], refstrain= 'wild-type')

        References
        ----------
        S Berthoumieux, H De Jong, G Baptist, C Pinel, C Ranquet, D Ropers,
        J Geiselmann (2013).
        Shared control of gene expression in bacteria by transcription factors
        and global physiology of the cell.
        Mol Syst Biol, 9, 634.

        CA Lichten, R White, IB Clark, PS Swain (2014).
        Unmixing of fluorescence spectra to resolve quantitative time-series
        measurements of gene expression in plate readers.
        BMC Biotech, 14, 1-11.

        I Mihalcescu, MVM Gateau, B Chelli, C Pinel, JL Ravanat (2015).
        Green autofluorescence, a double edged monitoring tool for bacterial
        growth and activity in micro-plates.
        Phys Biol, 12, 066016.

        """
        f = gu.makelist(f)
        exps, cons, strs = sunder.getall(
            self,
            experiments,
            experimentincludes,
            experimentexcludes,
            conditions,
            conditionincludes,
            conditionexcludes,
            strains,
            strainincludes,
            strainexcludes,
        )
        # check for negative fluorescence values
        for e in exps:
            for c in cons:
                if self.progress["negativevalues"][e]:
                    for datatype in f:
                        if (
                            datatype in self.progress["negativevalues"][e]
                            and c in self.progress["negativevalues"][e]
                        ):
                            print(
                                e + ": The negative values for",
                                datatype,
                                "in",
                                c,
                                "will generate NaNs",
                            )
        # going ahead
        print("Using", refstrain, "as the reference")
        # correct for autofluorescence
        if len(f) == 2:
            corrections.correctauto2(
                self,
                f,
                refstrain,
                figs,
                experiments,
                experimentincludes,
                experimentexcludes,
                conditions,
                conditionincludes,
                conditionexcludes,
                strains,
                strainincludes,
                strainexcludes,
            )
        elif len(f) == 1:
            corrections.correctauto1(
                self,
                f,
                refstrain,
                figs,
                experiments,
                experimentincludes,
                experimentexcludes,
                conditions,
                conditionincludes,
                conditionexcludes,
                strains,
                strainincludes,
                strainexcludes,
            )
        else:
            print("f must be a list of length 1 or 2")

    ###
    # Logging
    ###
    @property
    def log(self):
        """
        Prints a log of all methods called and their arguments.

        Example
        -------
        >>> p.log
        """
        print(self.logstream.getvalue())

    ###

[docs]    def savelog(self, fname=None):
        """
        Save log to file.

        Parameters
        --
        fname: string, optional
            The name of the file. If unspecified, the name of the experiment.

        Example
        -------
        >>> p.savelog()
        """
        # export log
        if fname:
            fnamepath = self.wdirpath / (fname + ".log")
        else:
            fnamepath = self.wdirpath / ("".join(self.allexperiments) + ".log")
        with fnamepath.open("w") as f:
            f.write(self.logstream.getvalue())
        print("Exported to", str(fnamepath))

    ###
    # Exporting and importing
    ###
[docs]    @clogger.log
    def exportdf(self, commonname=False, type="tsv"):
        """
        Exports the dataframes as either tab-delimited or csv or json files.
        Dataframes for the (processed) raw data, for summary data, and for
        summary statistics and corrections, as well as a log file, are
        exported.

        Parameters
        ----------
        commonname: string, optional
            The name used for the output files.
            If unspecified, the experiment or experiments is used.
        type: string
            The type of file for export, either 'json' or 'csv' or 'tsv'.

        Examples
        --------
        >>> p.exportdf()
        >>> p.exportdf('processed', type= 'json')
        """
        if commonname:
            fullcommonname = str(self.wdirpath / commonname)
        else:
            fullcommonname = str(self.wdirpath / "".join(self.allexperiments))
        # export data
        if type == "json":
            self.r.to_json(fullcommonname + "_r.json", orient="split")
            self.s.to_json(fullcommonname + "_s.json", orient="split")
            self.sc.to_json(fullcommonname + "_sc.json", orient="split")
        else:
            sep = "\t" if type == "tsv" else ","
            self.r.to_csv(fullcommonname + "_r." + type, sep=sep, index=False)
            self.s.to_csv(fullcommonname + "_s." + type, sep=sep, index=False)
            self.sc.to_csv(
                fullcommonname + "_sc." + type, sep=sep, index=False
            )
        # export log to file
        self.savelog(commonname)

    ###
[docs]    @clogger.log
    def importdf(self, commonnames, info=True, sep="\t"):
        """
        Import dataframes saved as either json or csv or tsv files.

        Parameters
        ----------
        commonnames: list of strings
            A list of names for the files to be imported with one string for
            each experiment.

        Examples
        --------
        >>> p.importdf('Gal')
        >>> p.importdf(['Gal', 'Glu', 'Raf'])
        """
        commonnames = gu.makelist(commonnames)
        # import data
        for commonname in commonnames:
            commonname = str(self.wdirpath / commonname)
            for df in ["r", "s", "sc"]:
                try:
                    # json files
                    exec(
                        "impdf= pd.read_json(commonname + '_' + df + "
                        "'.json', orient= 'split')"
                    )
                    print("Imported", commonname + "_" + df + ".json")
                except ValueError:
                    try:
                        # csv files
                        exec(
                            "impdf= pd.read_csv(commonname + '_' + df + "
                            "'.csv', sep= ',')"
                        )
                        print("Imported", commonname + "_" + df + ".csv")
                    except FileNotFoundError:
                        try:
                            # tsv files
                            exec(
                                "impdf= pd.read_csv(commonname + '_' + df "
                                "+ '.tsv', sep= '\t')"
                            )
                            print("Imported", commonname + "_" + df + ".tsv")
                        except FileNotFoundError:
                            print(
                                "No file called",
                                commonname
                                + "_"
                                + df
                                + ".json or .csv or .tsv found",
                            )
                            return
                # ensure all are imported as strings
                for var in ["experiment", "condition", "strain"]:
                    exec("impdf[var]= impdf[var].astype(str)")
                # merge dataframes
                if hasattr(self, df):
                    exec(
                        "self."
                        + df
                        + "= pd.merge(self."
                        + df
                        + ", impdf, how= 'outer')"
                    )
                else:
                    exec("self." + df + "= impdf")
            print()

        # update attributes
        self.allexperiments = list(self.s.experiment.unique())
        self.allconditions.update(
            {
                e: list(self.s[self.s.experiment == e].condition.unique())
                for e in self.allexperiments
            }
        )
        self.allstrains.update(
            {
                e: list(self.s[self.s.experiment == e].strain.unique())
                for e in self.allexperiments
            }
        )

        # find datatypes with mean in self.s
        dtypdict = {}
        for e in self.allexperiments:
            # drop columns of NaNs - these are created by merge if a datatype
            # is in one experiment but not in another
            tdf = self.s[self.s.experiment == e].dropna(axis=1, how="all")
            dtypdict[e] = list(tdf.columns[tdf.columns.str.contains("mean")])
        self.datatypes.update(
            {e: [dt.split(" mean")[0] for dt in dtypdict[e]] for e in dtypdict}
        )
        # initialise progress
        for e in self.allexperiments:
            admin.initialiseprogress(self, e)
        # display info on import
        if info:
            self.info

        # display warning if duplicates created
        if len(self.allexperiments) != np.unique(self.allexperiments).size:
            print(
                "\nLikely ERROR: data with the same experiment, condition, "
                "strain, and time now appears twice!!"
            )


###
if __name__ == "__main__":
    print(platereader.__doc__)
omniplate 0.4.92 documentation

Source code for omniplate.Omniplate