#!/usr/bin/env python
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import warnings
import re
from pathlib import Path
import gaussianprocessderivatives as gp
import om_code.omgenutils as gu
from om_code.omfitderiv import fitderiv
import om_code.omplot as omplot
import om_code.loadplatedata as loadplatedata
import om_code.omerrors as errors
import om_code.corrections as corrections
import om_code.sunder as sunder
import om_code.omstats as omstats
import om_code.admin as admin
import om_code.clogger as clogger
version = "0.9.42"
plt.rcParams["figure.max_open_warning"] = 0
sns.set()
class platereader:
"""
for analyzing plate-reader data, correcting for autofluorescence, and
determining growth rates.
All data is stored in pandas dataframes and plotted using Seaborn.
Three dataframes are created. If p is an instance of the platereader class,
then p.r contains the raw data for each well in the plate; p.s contains the
processed time-series using the data from all relevant wells; and p.sc
contains any summary statistics, such as 'max gr'.
For time series sampled from a Gaussian process, the mean is used as the
statistic and errors are estimated by the standard deviation.
For statistics calculated from time series, the median is used and errors
are estimated by half the interquartile range, with the distribution of
the statistic found by sampling time series.
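As an illustration, once getstats has been run, summary statistics can
be read directly from p.sc, for example
>>> p.sc[['experiment', 'strain', 'condition', 'max gr']]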
Examples
--------
A typical work flow is:
>>> import omniplate as om
then either
>>> p= om.platereader('GALdata.xlsx', 'GALcontents.xlsx',
... wdir= 'data/')
or
>>> p= om.platereader()
>>> p.load('GALdata.xls', 'GALcontents.xlsx')
and to analyse OD data
>>> p.plot('OD', plate= True)
>>> p.correctOD()
>>> p.correctmedia()
>>> p.plot(y= 'OD')
>>> p.plot(y= 'OD', hue= 'strain',
... conditionincludes= ['Gal', 'Glu'],
... strainexcludes= 'HXT7')
>>> p.getstats('OD')
and for fluorescence data
>>> p.correctauto(['GFP', 'AutoFL'])
>>> p.plot(y= 'c-GFPperOD', hue= 'condition')
and to save the results
>>> p.savefigs()
>>> p.exportdf()
General properties of the data and of previous processing are shown with:
>>> p.info
>>> p.attributes
>>> p.corrections()
>>> p.log
See also
http://swainlab.bio.ed.ac.uk/software/omniplate/index.html
for a tutorial, which can be opened directly using
>>> p.webhelp()
"""
# ratio of fluorescence emission at 585nm to emission at 525nm for eGFP
_gamma = 0.114
###
def __init__(
self,
dnames=False,
anames=False,
wdir=".",
platereadertype="Tecan",
dsheetnumbers=False,
asheetnumbers=False,
ODfname=None,
info=True,
ls=True,
):
"""
Initialise and optionally immediately load data for processing.
Parameters
----------
dnames: string or list of strings, optional
The name of the file containing the data from the plate reader or
a list of file names.
anames: string or list of strings, optional
The name of the file containing the corresponding annotation or a list
of file names.
wdir: string, optional
The working directory where the data files are stored and where
output will be saved.
platereadertype: string
The type of plate reader, currently either 'Tecan' or 'Sunrise' or
'old Tecan'.
dsheetnumbers: integer or list of integers, optional
The relevant sheets of the Excel files storing the data.
asheetnumbers: integer or list of integers, optional
The relevant sheets of the corresponding Excel files for the
annotation.
ODfname: string, optional
The name of the file with the dilution data used to correct OD for
its non-linear dependence on numbers of cells. If unspecified, data
for haploid budding yeast growing in glucose is used.
info: boolean
If True (default), display summary information on the data once
loaded.
ls: boolean
If True (default), display contents of working directory.
"""
self.__version__ = version
print("\nomniplate version=", self.__version__)
# path to the working directory
self.wdirpath = Path(wdir)
# enable logging
self.logger, self.logstream = clogger.initlog(version)
# ignore the warning generated occasionally when sampling from the
# Gaussian process, likely because of numerical errors
warnings.simplefilter("ignore", RuntimeWarning)
# dictionary recording extent of analysis
self.progress = {
"ignoredwells": {},
"negativevalues": {},
"ODfname": {},
"gc": {},
}
self.allexperiments = []
self.allconditions = {}
self.allstrains = {}
self.datatypes = {}
if dnames is False:
# list all files in current directory
if ls:
self.ls
else:
# immediately load data
self.load(
dnames,
anames,
platereadertype,
dsheetnumbers,
asheetnumbers,
ODfname,
info,
)
###
def __repr__(self):
    repstr = "{} v{}: ".format(self.__class__.__name__, self.__version__)
    repstr += " ; ".join(self.allexperiments)
    return repstr
###
@property
def ls(self):
"""
List all files in the working directory.
A dictionary of available datasets is created as a shortcut.
Example
-------
>>> p.ls
>>> p.ds
>>> p.load(p.ds[0], p.ds[1])
"""
self.ds = {}
print("Working directory is", str(self.wdirpath.resolve()))
print("Files available are:", "\n---")
files = []
for f in self.wdirpath.glob("*.*"):
if f.is_file() and f.suffix in [".xlsx", ".json", ".tsv", ".csv", ".xls"]:
print(f.stem + f.suffix)
files.append(f.stem + f.suffix)
print()
self.ds = {i: f for i, f in enumerate(sorted(files))}
###
def changewdir(self, wdir):
"""
Change working directory.
Parameters
----------
wdir: string
The new working directory specified from the current directory.
Example
-------
>>> p.changewdir('newdata/')
"""
self.wdirpath = Path(wdir)
self.ls
###
@clogger.log
def load(
self,
dnames,
anames=False,
platereadertype="Tecan",
dsheetnumbers=False,
asheetnumbers=False,
ODfname=None,
info=True,
):
"""
Loads raw data files generated by the plate reader and the
corresponding annotation files.
Parameters
----------
dnames: string or list of strings
The name of the file containing the data from the plate reader
or a list of file names.
anames: string or list of strings, optional
The name of the file containing the corresponding annotation or a list
of file names.
platereadertype: string
The type of plate reader, currently either 'Tecan' or 'Sunrise' or
'old Tecan'.
dsheetnumbers: integer or list of integers, optional
The relevant sheets of the Excel files storing the data.
asheetnumbers: integer or list of integers, optional
The relevant sheets of the corresponding Excel files for the
annotation.
ODfname: string, optional
The name of the file with the dilution data used to correct OD for
its non-linear dependence on numbers of cells. If unspecified, data
for haploid budding yeast growing in glucose is used.
info: boolean
If True (default), display summary information on the data once
loaded.
Examples
--------
>>> p.load('Data.xlsx', 'DataContents.xlsx')
>>> p.load('Data.xlsx', 'DataContents.xlsx', info= False)
>>> p.load('Data.xlsx', 'DataContents.xlsx',
... ODfname= 'ODcorrection_Glucose_Diploid.txt')
"""
dnames = gu.makelist(dnames)
if not anames:
anames = [
dname.split(".")[0] + "Contents.xlsx" for dname in dnames
]
else:
anames = gu.makelist(anames)
if not dsheetnumbers:
dsheetnumbers = [0 for dname in dnames]
if not asheetnumbers:
asheetnumbers = [0 for dname in dnames]
alldata = {}
for i, dname in enumerate(dnames):
# get dataframe for raw data
(
rdf,
allconditionssingle,
allstrainssingle,
alldatasingle,
experiment,
datatypes,
) = loadplatedata.loadfromplate(
platereadertype,
self.wdirpath,
dname,
dsheetnumbers[i],
anames[i],
asheetnumbers[i],
)
self.allexperiments.append(experiment)
self.allconditions[experiment] = allconditionssingle
self.allstrains[experiment] = allstrainssingle
self.datatypes[experiment] = datatypes
alldata.update(alldatasingle)
self.r = (
pd.merge(self.r, rdf, how="outer")
if hasattr(self, "r")
else rdf
)
# update progress dictionary
admin.initialiseprogress(self, experiment)
# define ODfname in progress dictionary
if ODfname:
if isinstance(ODfname, str):
self.progress["ODfname"] = {
exp: ODfname for exp in self.allexperiments
}
else:
self.progress["ODfname"] = {
exp: ODfname[i]
for i, exp in enumerate(self.allexperiments)
}
else:
self.progress["ODfname"] = {
exp: None for exp in self.allexperiments
}
# dataframe for summary stats and corrections
alldfs = []
# for exp in self.allexperiments:
for exp in alldata:
strs, cons = [], []
for cs in alldata[exp]:
strs.append(cs.split(" in ")[0])
cons.append(cs.split(" in ")[1])
corrdict = {
"experiment": exp,
"strain": strs,
"condition": cons,
"OD corrected": False,
}
corrdict.update(
{
dtype + " corrected for media": False
for dtype in self.datatypes[exp]
}
)
corrdict.update(
{
dtype + " corrected for autofluorescence": False
for dtype in self.datatypes[exp]
if dtype not in ["AutoFL", "OD"]
}
)
alldfs.append(pd.DataFrame(corrdict))
self.sc = pd.concat(alldfs)
# dataframe of original data
self.origr = self.r.copy()
# dataframe for well content
self.wellsdf = admin.makewellsdf(self.r)
# dataframe for summary data
self.s = admin.make_s(self)
# display info on experiment, conditions and strains
if info:
self.info
print(
'\nWarning: wells with no strains have been changed to "Null"'
"\nto avoid confusion with pandas.\n"
)
###
# Routines to display information on data and state of data processing
###
@property
def info(self):
"""
Displays conditions, strains, and datatypes.
Example
-------
>>> p.info
"""
for exp in self.allexperiments:
print("\nExperiment:", exp, "\n---")
print("Conditions:")
for c in sorted(self.allconditions[exp], key=gu.natural_keys):
print("\t", c)
print("Strains:")
for s in sorted(self.allstrains[exp], key=gu.natural_keys):
print("\t", s)
print("Data types:")
for d in self.datatypes[exp]:
print("\t", d)
if self.progress["ignoredwells"]:
print("Ignored wells:")
if self.progress["ignoredwells"][exp]:
for d in self.progress["ignoredwells"][exp]:
print("\t", d)
else:
print("\t", "None")
print()
###
[docs] def webhelp(self, browser=None):
"""
Opens detailed examples of how to use omniplate in a web browser.
Parameters
----------
browser: string, optional
The browser to use - either the default if unspecified or 'firefox',
'chrome', etc.
Example
-------
>>> p.webhelp()
"""
import webbrowser
url = "https://swainlab.bio.ed.ac.uk/software/omniplate/index.html"
webbrowser.get(browser).open_new(url)
###
@property
def attributes(self):
"""
Displays the names of the attributes of the current instance of
platereader and acts as a check to see what variables have been
calculated or determined.
Example
-------
>>> p.attributes
"""
ignore = [
"d",
"consist",
"t",
"nosamples",
"_gamma",
"ODfname",
"overflow",
"nooutchannels",
"nodata",
"__doc__",
]
for a in self.__dict__:
if (
"corrected" not in a
and "processed" not in a
and a not in ignore
):
print(a)
###
@clogger.log
def rename(self, translatedict):
"""
Uses a dictionary to replace all occurrences of a strain or a condition
with an alternative.
Note that instances of self.progress will not be updated.
Parameters
----------
translatedict: dictionary
A dictionary of old name - new name pairs
Example
-------
>>> p.rename({'77.WT' : 'WT', '409.Hxt4' : 'Hxt4'})
"""
# replace in dataframes
for df in [self.r, self.s, self.sc]:
df.replace(translatedict, inplace=True)
# rename in attributes
def applydict(a):
return translatedict[a] if a in translatedict else a
for e in self.allexperiments:
self.allconditions[e] = list(map(applydict, self.allconditions[e]))
self.allstrains[e] = list(map(applydict, self.allstrains[e]))
###
def corrections(
self,
experiments="all",
conditions="all",
strains="all",
experimentincludes=False,
experimentexcludes=False,
conditionincludes=False,
conditionexcludes=False,
strainincludes=False,
strainexcludes=False,
):
"""
Displays the status of corrections made for the specified strains,
conditions, and experiments.
Parameters
----------
experiments: string or list of strings
The experiments to include.
conditions: string or list of strings
The conditions to include.
strains: string or list of strings
The strains to include.
experimentincludes: string, optional
Selects only experiments that include the specified string in their
name.
experimentexcludes: string, optional
Ignores experiments that include the specified string in their
name.
conditionincludes: string, optional
Selects only conditions that include the specified string in their
name.
conditionexcludes: string, optional
Ignores conditions that include the specified string in their name.
strainincludes: string, optional
Selects only strains that include the specified string in their
name.
strainexcludes: string, optional
Ignores strains that include the specified string in their name.
Returns
-------
df: dataframe
Contains the status of the corrections for the specified strains,
conditions, and experiments.
Examples
--------
>>> p.corrections()
>>> p.corrections(strainincludes= 'GAL')
"""
exps, cons, strs = sunder.getall(
self,
experiments,
experimentincludes,
experimentexcludes,
conditions,
conditionincludes,
conditionexcludes,
strains,
strainincludes,
strainexcludes,
)
df = self.sc.query(
"experiment == @exps and condition == @cons and strain == @strs"
)
# only show corrections and not stats
df = df[
["experiment", "strain", "condition"]
+ [col for col in df.columns if "correct" in col]
]
df = df.T
return df
###
@clogger.log
def addcolumn(self, newcolumnname, oldcolumn, newcolumnvalues):
"""
Adds a new column to all dataframes by parsing an existing column.
All possible entries for the new column are specified as strings and
the entry in the new column will be whichever of these strings is
present in the entry of the existing column.
Parameters
----------
newcolumnname: string
The name of the new column.
oldcolumn: string
The name of the column to be parsed to create the new column.
newcolumnvalues: list of strings
All of the possible values for the entries in the new column.
Example
-------
>>> p.addcolumn('medium', 'condition', ['Raffinose',
... 'Geneticin'])
will parse each entry in 'condition' to create a new column called
'medium' that has either a value 'Raffinose' if 'Raffinose' is in the
entry from 'condition' or a value 'Geneticin' if 'Geneticin' is in the
entry from 'condition'.
"""
for df in [self.r, self.s, self.sc]:
newcol = np.array(
("",) * len(df[oldcolumn].to_numpy()), dtype="object"
)
for i, oldcolvalue in enumerate(df[oldcolumn].to_numpy()):
for newcolvalue in newcolumnvalues:
if newcolvalue in oldcolvalue:
newcol[i] = newcolvalue
df[newcolumnname] = newcol
###
@clogger.log
def addnumericcolumn(
self,
newcolumnname,
oldcolumn,
picknumber=0,
leftsplitstr=None,
rightsplitstr=None,
asstr=False,
):
"""
Adds a new numeric column by parsing the numbers from the entries of
an existing column.
It is best to run this command only after the basic analyses -
ignorewells, correctOD, and correctmedia - have been performed because
it changes the structure of the dataframes and may cause errors.
Parameters
----------
newcolumnname: string
The name of the new column.
oldcolumn: string
The name of column to be parsed.
picknumber: integer
The number to pick from the list of numbers extracted from the
existing column's entry.
leftsplitstr: string, optional
Split the entry of the column using whitespace and parse numbers
from the substring to the immediate left of leftsplitstr rather
than the whole entry.
rightsplitstr: string, optional
Split the entry of the column using whitespace and parse numbers
from the substring to the immediate right of rightsplitstr rather
than the whole entry.
asstr: boolean
If True, convert the numeric value to a string to improve plots
with seaborn.
Examples
--------
To extract concentrations from conditions use
>>> p.addnumericcolumn('concentration', 'condition')
For a condition like '0.5% Raf 0.05ug/mL Cycloheximide', use
>>> p.addnumericcolumn('raffinose', 'condition',
... picknumber= 0)
>>> p.addnumericcolumn('cycloheximide', 'condition',
... picknumber= 1)
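To parse a number immediately to the left of a keyword - assuming here
conditions of the form '2% Glu' - use
>>> p.addnumericcolumn('glucose', 'condition',
... leftsplitstr= 'Glu')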
"""
# process splitstrs
if leftsplitstr or rightsplitstr:
splitstr = leftsplitstr if leftsplitstr else rightsplitstr
locno = -1 if leftsplitstr else 1
else:
splitstr = False
# change each dataframe
for df in [self.r, self.s, self.sc]:
if asstr:
# new column of strings
newcol = np.full_like(
df[oldcolumn].to_numpy(), "", dtype="object"
)
else:
# new column of floats
newcol = np.full_like(
df[oldcolumn].to_numpy(), np.nan, dtype="float"
)
# parse old column
for i, oldcolvalue in enumerate(df[oldcolumn].to_numpy()):
if oldcolvalue:
# split string first on spaces and then find substring
# adjacent to specified splitstring
if splitstr:
if splitstr in oldcolvalue:
# oldcolvalue contains leftsplitstring or
# rightsplitstring
bits = oldcolvalue.split()
for k, bit in enumerate(bits):
if splitstr in bit:
loc = k + locno
break
# adjacent string
oldcolvalue = bits[loc]
else:
# oldcolvalue does not contain leftsplitstring
# or rightsplitstring
oldcolvalue = ""
# loop through all floats in oldcolvalue
nocount = 0
for ci in re.split(
r"[+-]?([0-9]+(?:[.][0-9]*)?|[.][0-9]+)", oldcolvalue
):
try:
no = float(ci)
if nocount == picknumber:
newcol[i] = ci if asstr else no
break
nocount += 1
except ValueError:
pass
df[newcolumnname] = newcol
###
@clogger.log
def add_to_sc(
self,
newcolumn=None,
s_column=None,
func=None,
experiments="all",
experimentincludes=False,
experimentexcludes=False,
conditions="all",
conditionincludes=False,
conditionexcludes=False,
strains="all",
strainincludes=False,
strainexcludes=False,
):
"""
Applies func to a column in the s dataframe and
stores the results in the sc dataframe.
Parameters
----------
newcolumn: string
The name of the new column in the sc dataframe.
s_column: string
The name of the column in the s dataframe from which the data are to
be processed.
func: function
The function to be applied to the data in the s dataframe.
Examples
--------
>>> p.add_to_sc(newcolumn= "max GFP", s_column= "GFP mean",
... func= np.nanmax)
>>> p.add_to_sc(newcolumn= "GFP lower quartile", s_column= "GFP mean",
... func= lambda x: np.nanquantile(x, 0.25))
"""
# extract data
exps, cons, strs = sunder.getall(
self,
experiments,
experimentincludes,
experimentexcludes,
conditions,
conditionincludes,
conditionexcludes,
strains,
strainincludes,
strainexcludes,
)
self.sc[newcolumn] = np.nan
for e in exps:
for c in cons:
for s in strs:
d = self.s.query(
"experiment == @e and condition == @c and "
"strain == @s"
)[s_column].values
res = np.asarray(func(d))
if res.size == 1:
self.sc.loc[
(self.sc.experiment == e)
& (self.sc.condition == c)
& (self.sc.strain == s),
newcolumn,
] = func(d)
else:
print("func must return a single value")
return False
###
@clogger.log
def addcommonvar(
self,
var="time",
dvar=None,
varmin=None,
varmax=None,
figs=True,
experiments="all",
experimentincludes=False,
experimentexcludes=False,
conditions="all",
conditionincludes=False,
conditionexcludes=False,
strains="all",
strainincludes=False,
strainexcludes=False,
):
"""
Adds a common variable to the time-dependent dataframes, with values
drawn from a fixed array so that they are identical for all
experiments. This common variable allows averaging across experiments
and typically is time.
For example, the plate reader often does not perfectly increment time
between measurements, and different experiments can have slightly
different time points despite the plate reader having the same
settings. These unique times prevent seaborn from taking averages.
If experiments have measurements that start at the same time point and
have the same interval between measurements, then setting a commontime
for all experiments will allow seaborn to perform averaging.
The array of the common variable runs from varmin to varmax with an
interval dvar. These parameters are automatically calculated, but may
be specified.
Each instance of var is assigned a common value - the closest instance
of the common variable to the instance of var. Measurements are assumed
to be the same for the true instance of var and for the assigned common
value, which may generate errors if these two are sufficiently
distinct.
An alternative method is averageoverexpts.
Parameters
----------
var: string
The variable from which the common variable is generated,
typically 'time'.
dvar: float, optional
The interval between the values comprising the common array.
varmin: float, optional
The minimum of the common variable.
varmax: float, optional
The maximum of the common variable.
figs: boolean
If True, generate plot to check if the variable and the common
variable generated from it are sufficiently close in value.
experiments: string or list of strings
The experiments to include.
conditions: string or list of strings
The conditions to include.
strains: string or list of strings
The strains to include.
experimentincludes: string, optional
Selects only experiments that include the specified string in their
name.
experimentexcludes: string, optional
Ignores experiments that include the specified string in their
name.
conditionincludes: string, optional
Selects only conditions that include the specified string in their
name.
conditionexcludes: string, optional
Ignores conditions that include the specified string in their name.
strainincludes: string, optional
Selects only strains that include the specified string in their
name.
strainexcludes: string, optional
Ignores strains that include the specified string in their name.
Example
-------
To plot averages of time-dependent variables over experiments, use for
example
>>> p.addcommonvar('time')
>>> p.plot(x= 'commontime', y= 'c-GFPperOD',
... hue= 'condition', style= 'strain')
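The spacing and range of the common variable may also be set
explicitly, for example
>>> p.addcommonvar('time', dvar= 0.25, varmin= 0)
although the defaults, estimated from the data, are usually sufficient.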
"""
exps, cons, strs = sunder.getall(
self,
experiments,
experimentincludes,
experimentexcludes,
conditions,
conditionincludes,
conditionexcludes,
strains,
strainincludes,
strainexcludes,
nonull=True,
)
print("Finding common" + var)
for df in [self.r, self.s]:
if var in df:
loc = (
df.experiment.isin(exps)
& df.condition.isin(cons)
& df.strain.isin(strs)
)
print("r dataframe") if df.equals(self.r) else print(
"s dataframe"
)
if dvar is None:
# calculated for tidy printing
elen = np.max([len(e) for e in exps]) + 5
# find median increment in var
for e in exps:
evar = df[loc][var].to_numpy()
print(
" {:{}} {}_min= {:.2e} ; d{}= {:.2e}".format(
e,
elen,
var,
np.min(evar),
var,
np.median(np.diff(evar)),
)
)
ldvar = np.median(np.diff(df[loc][var].to_numpy()))
else:
ldvar = dvar
print(" Using d{}= {:.2e}".format(var, ldvar))
lvarmin = df[loc][var].min() if varmin is None else varmin
print(" Using {}_min= {:.2e}\n".format(var, lvarmin))
lvarmax = df[loc][var].max() if varmax is None else varmax
# define common var
cvar = np.arange(lvarmin, lvarmax, ldvar)
df.loc[loc, "common" + var] = df[loc][var].apply(
lambda x: cvar[np.argmin((x - cvar) ** 2)]
)
if figs:
plt.figure()
sl = np.linspace(
df[loc][var].min(), 1.05 * df[loc][var].max(), 100
)
plt.plot(sl, sl, alpha=0.4)
plt.plot(
df[loc][var].to_numpy(),
df[loc]["common" + var].to_numpy(),
".",
)
plt.xlabel(var)
plt.ylabel("common" + var)
title = (
"r dataframe" if df.equals(self.r) else "s dataframe"
)
plt.title(title)
plt.suptitle(
"comparing "
+ var
+ " with common"
+ var
+ " – the line y= x is expected"
)
plt.tight_layout()
plt.show()
###
# Routines to examine and ignore individual wells
###
def contentsofwells(self, wlist):
"""
Displays the contents of wells.
Parameters
----------
wlist: string or list of string
Specifies the well or wells of interest.
Examples
--------
>>> p.contentsofwells(['A1', 'E4'])
"""
wlist = gu.makelist(wlist)
for w in wlist:
print("\n" + w + "\n--")
print(
self.wellsdf.query("well == @w")
.drop(["well"], axis=1)
.to_string(index=False)
)
###
def showwells(
self,
concise=False,
sortby=False,
experiments="all",
conditions="all",
strains="all",
experimentincludes=False,
experimentexcludes=False,
conditionincludes=False,
conditionexcludes=False,
strainincludes=False,
strainexcludes=False,
):
"""
Displays wells for specified experiments, conditions, and strains.
Parameters
----------
concise: boolean
If True, display as experiment: condition: strain.
sortby: list of strings, optional
List of column names on which to sort the results.
experiments: string or list of strings
The experiments to include.
conditions: string or list of strings
The conditions to include.
strains: string or list of strings
The strains to include.
experimentincludes: string, optional
Selects only experiments that include the specified string in their
name.
experimentexcludes: string, optional
Ignores experiments that include the specified string in their
name.
conditionincludes: string, optional
Selects only conditions that include the specified string in their
name.
conditionexcludes: string, optional
Ignores conditions that include the specified string in their name.
strainincludes: string, optional
Selects only strains that include the specified string in their
name.
strainexcludes: string, optional
Ignores strains that include the specified string in their name.
Examples
--------
>>> p.showwells()
>>> p.showwells(strains= 'Mal12:GFP', conditions= '1% Mal')
"""
exps, cons, strs = sunder.getall(
self,
experiments,
experimentincludes,
experimentexcludes,
conditions,
conditionincludes,
conditionexcludes,
strains,
strainincludes,
strainexcludes,
nonull=False,
)
if not hasattr(self, "wellsdf"):
self.wellsdf = admin.makewellsdf(self.r)
df = self.wellsdf.query(
"experiment == @exps and condition == @cons and strain == @strs"
)
if sortby:
df = df.sort_values(by=gu.makelist(sortby))
print()
for e in exps:
if concise:
print(
df[["experiment", "condition", "strain"]]
.drop_duplicates()
.query("experiment == @e")
.to_string(index=False)
)
else:
print(df.query("experiment == @e").to_string(index=False))
print()
###
@clogger.log
def ignorewells(
self,
exclude=[],
experiments="all",
experimentincludes=False,
experimentexcludes=False,
clearall=False,
):
"""
Allows wells to be ignored in any future processing.
If called several times, the default behaviour is for any previously
ignored wells not to be re-instated.
Parameters
----------
exclude: list of strings
List of labels of wells on the plate to be excluded.
experiments: string or list of strings
The experiments to include.
experimentincludes: string, optional
Selects only experiments that include the specified string in their
name.
experimentexcludes: string, optional
Ignores experiments that include the specified string in their
name.
clearall: boolean
If True, all previously ignored wells are re-instated.
Example
-------
>>> p.ignorewells(['A1', 'C2'])
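For multiple experiments, give one list of wells per experiment - the
experiment names here are hypothetical:
>>> p.ignorewells([['A1'], ['C3', 'C4']],
... experiments= ['Expt1', 'Expt2'])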
"""
if clearall:
# forget any previously ignoredwells
self.r = self.origr.copy()
self.progress["ignoredwells"] = {
exp: [] for exp in self.allexperiments
}
admin.update_s(self)
print(
"Warning: all corrections and analysis to raw data have been"
" lost. No wells have been ignored."
)
else:
if gu.islistempty(exclude):
return
else:
# exclude should be a list of lists
if isinstance(exclude, str):
exclude = [gu.makelist(exclude)]
elif isinstance(exclude[0], str):
exclude = [exclude]
# check consistency
if len(self.allexperiments) == 1:
exps = self.allexperiments
else:
exps = sunder.getexps(
self,
experiments,
experimentincludes,
experimentexcludes,
)
if len(exclude) != len(exps):
raise errors.IgnoreWells(
"Either a list of wells to exclude for a particular\n"
"experiment or a list of experiments must be given."
)
else:
# drop wells
for ex, exp in zip(exclude, exps):
# wells cannot be ignored twice
wex = list(
set(ex) - set(self.progress["ignoredwells"][exp])
)
# drop data from ignored wells
df = self.r
filt = (df["experiment"] == exp) & df["well"].isin(wex)
df = df.loc[~filt]
df = df.reset_index(drop=True)
self.r = df
# store ignoredwells
self.progress["ignoredwells"][exp] += ex
# remove any duplicates
self.progress["ignoredwells"][exp] = list(
set(self.progress["ignoredwells"][exp])
)
anycorrections = np.count_nonzero(
self.sc[
[
col
for col in self.sc.columns
if "correct" in col
]
].values
)
if anycorrections:
print(
"Warning: you have ignored wells after correcting\n"
"the data. It is best to ignorewells first, before\n"
"running any analysis."
)
# remake summary data
admin.update_s(self)
###
@clogger.log
def restricttime(self, tmin=None, tmax=None):
"""
Restrict the processed data to a range of time, ignoring points outside
this time range.
Note that data in the .s dataframe outside the time range is lost.
Exporting the dataframes before running restricttime is recommended.
Parameters
----------
tmin: float
The minimum value of time, with data kept only for t >= tmin.
tmax: float
The maximum value of time, with data kept only for t <= tmax.
Example
-------
>>> p.restricttime(tmin= 5)
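Because data outside the time range is lost from the .s dataframe,
exporting first is prudent:
>>> p.exportdf()
>>> p.restricttime(tmin= 2, tmax= 15)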
"""
if tmin is None:
tmin = self.r.time.min()
if tmax is None:
tmax = self.r.time.max()
if tmax > tmin:
self.s = self.s[(self.s.time >= tmin) & (self.s.time <= tmax)]
else:
print("tmax or tmin is not properly defined")
###
# Routines for plotting
###
@clogger.log
def plot(
self,
x="time",
y="OD",
hue="strain",
style="condition",
size=None,
kind="line",
col=None,
row=None,
height=5,
aspect=1,
ymin=None,
figsize=False,
returnfacetgrid=False,
title=None,
plate=False,
wells=False,
nonull=False,
messages=False,
sortby=False,
experiments="all",
conditions="all",
strains="all",
experimentincludes=False,
experimentexcludes=False,
conditionincludes=False,
conditionexcludes=False,
strainincludes=False,
strainexcludes=False,
**kwargs,
):
"""
Plots from the underlying dataframes (chosen automatically) using
Seaborn's relplot, which is described at
https://seaborn.pydata.org/generated/seaborn.relplot.html
Parameters
----------
x: string
The variable - column of the dataframe - for the x-axis.
y: string
The variable - column of the dataframe - for y-axis.
hue: string
The variable whose variation will determine the colours of the
lines plotted. From Seaborn.
style: string
The variable whose variation will determine the style of each line.
From Seaborn.
size: string
The variable whose variation will determine the size of each
marker. From Seaborn.
kind: string
Either 'line' or 'scatter', which determines the type of plot.
From Seaborn.
col: string, optional
The variable that varies over the columns in a multipanel plot.
From Seaborn.
row: string, optional
The variable that varies over the rows in a multipanel plot.
From Seaborn.
height: float, optional
The height of the individual panels in a multipanel plot.
From Seaborn.
aspect: float, optional
The aspect ratio of the individual panels in a multipanel plot.
From Seaborn.
ymin: float, optional
The minimum y-value.
figsize: tuple, optional
A tuple of (width, height) for the size of figure.
Ignored if wells= True or plate= True.
returnfacetgrid: boolean, optional
If True, return Seaborn's facetgrid object created by relplot.
title: string, optional
The title of the plot (overwrites any default titles).
plate: boolean, optional
If True, data for each well for a whole plate are plotted in one
figure.
wells: boolean, optional
If True, data for the individual wells are shown.
nonull: boolean, optional
If True, 'Null' strains are not plotted.
sortby: list of strings, optional
A list of columns to sort the data in the dataframe and passed to
pandas sort_values.
messages: boolean, optional
If True, print warnings for any data requested but not found.
experiments: string or list of strings
The experiments to include.
conditions: string or list of strings
The conditions to include.
strains: string or list of strings
The strains to include.
experimentincludes: string, optional
Selects only experiments that include the specified string in
their name.
experimentexcludes: string, optional
Ignores experiments that include the specified string in their
name.
conditionincludes: string, optional
Selects only conditions that include the specified string in their
name.
conditionexcludes: string, optional
Ignores conditions that include the specified string in their name.
strainincludes: string, optional
Selects only strains that include the specified string in their
name.
strainexcludes: string, optional
Ignores strains that include the specified string in their name.
kwargs: for Seaborn's relplot
https://seaborn.pydata.org/generated/seaborn.relplot.html
Returns
-------
sfig: Seaborn's facetgrid object generated by relplot if
returnfacetgrid= True
Examples
--------
>>> p.plot(y= 'OD', plate= True)
>>> p.plot(y= 'OD', wells= True, strainincludes= 'Gal10:GFP')
>>> p.plot(y= 'OD')
>>> p.plot(x= 'OD', y= 'gr')
>>> p.plot(y= 'c-GFPperOD', nonull= True, ymin= 0)
>>> p.plot(y= 'c-GFPperOD', conditionincludes= '2% Mal',
... hue= 'strain')
>>> p.plot(y= 'c-mCherryperOD', conditions= ['0.5% Mal',
... '1% Mal'], hue= 'strain', style= 'condition',
... nonull= True, strainincludes= 'mCherry')
>>> p.plot(y= 'c-GFPperOD', col= 'experiment')
>>> p.plot(y= 'max gr')
"""
# choose the correct dataframe
basedf, dfname = omplot.plotfinddf(self, x, y)
# get experiments, conditions and strains
exps, cons, strs = sunder.getall(
self,
experiments,
experimentincludes,
experimentexcludes,
conditions,
conditionincludes,
conditionexcludes,
strains,
strainincludes,
strainexcludes,
nonull,
)
# choose the right type of plot
if plate:
dtype = y if x == "time" else x
omplot.plotplate(basedf, exps, dtype)
elif wells:
omplot.plot_wells(
x,
y,
basedf,
exps,
cons,
strs,
style,
size,
kind,
col,
row,
ymin,
title,
messages,
**kwargs,
)
elif dfname == "s" or dfname == "r":
omplot.plot_rs(
x,
y,
basedf,
exps,
cons,
strs,
hue,
style,
size,
kind,
col,
row,
height,
aspect,
ymin,
title,
figsize,
sortby,
returnfacetgrid,
**kwargs,
)
elif dfname == "sc":
omplot.plot_sc(
x,
y,
basedf,
exps,
cons,
strs,
hue,
style,
size,
kind,
col,
row,
height,
aspect,
ymin,
figsize,
title,
sortby,
**kwargs,
)
else:
raise errors.PlotError("No data found")
###
def savefigs(self, fname=None, onefile=True):
"""
Saves all current figures to PDF, either to one file or each to a
separate file.
Parameters
----------
fname: string, optional
Name of file. If unspecified, the name of the experiment is used.
onefile: boolean, optional
If False, each figure is saved to its own PDF file.
Example
-------
>>> p.savefigs()
>>> p.savefigs('figures.pdf')
"""
if fname:
if ".pdf" not in fname:
fname += ".pdf"
fname = str(self.wdirpath / fname)
else:
fname = str(
self.wdirpath / ("".join(self.allexperiments) + ".pdf")
)
if onefile:
gu.figs2pdf(fname)
else:
for i in plt.get_fignums():
plt.figure(i)
savename = str(plt.getp(plt.gcf(), "axes")[0].title).split(
"'"
)[1]
savename = savename.replace(" ", "_")
if savename == "":
savename = "Whole_plate_Figure_" + str(i)
print("Saving", savename)
plt.savefig(str(self.wdirpath / (savename + ".pdf")))
###
@property
def close(self):
"""
Close all figures.
Example
-------
>>> p.close
"""
plt.close("all")
###
@clogger.log
def getdataframe(
self,
dfname="s",
experiments="all",
conditions="all",
strains="all",
experimentincludes=False,
experimentexcludes=False,
conditionincludes=False,
conditionexcludes=False,
strainincludes=False,
strainexcludes=False,
nonull=True,
):
"""
Obtain a subset of the data in a dataframe, which can be used for
plotting directly.
Parameters
----------
dfname: string
The dataframe of interest either 'r' (raw data),
's' (default; processed data),
or 'sc' (summary statistics).
experiments: string or list of strings
The experiments to include.
conditions: string or list of strings
The conditions to include.
strains: string or list of strings
The strains to include.
experimentincludes: string, optional
Selects only experiments that include the specified string in their
name.
experimentexcludes: string, optional
Ignores experiments that include the specified string in their
name.
conditionincludes: string, optional
Selects only conditions that include the specified string in their
name.
conditionexcludes: string, optional
Ignores conditions that include the specified string in their name.
strainincludes: string, optional
Selects only strains that include the specified string in their
name.
strainexcludes: string, optional
Ignores strains that include the specified string in their name.
nonull: boolean, optional
If True, ignore 'Null' strains.
Returns
-------
ndf: dataframe
Example
-------
>>> ndf= p.getdataframe('s', conditions= ['2% Glu'],
... nonull= True)
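The returned dataframe can be passed directly to seaborn - assuming
here that the s dataframe contains an 'OD mean' column:
>>> import seaborn as sns
>>> sns.relplot(x= 'time', y= 'OD mean', hue= 'strain',
... kind= 'line', data= ndf)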
"""
exps, cons, strs = sunder.getall(
self,
experiments,
experimentincludes,
experimentexcludes,
conditions,
conditionincludes,
conditionexcludes,
strains,
strainincludes,
strainexcludes,
nonull,
)
if hasattr(self, dfname):
df = getattr(self, dfname)
ndf = df.query(
"experiment == @exps and condition == @cons "
"and strain == @strs"
)
if ndf.empty:
print("No data found")
else:
return ndf.copy()
else:
raise errors.UnknownDataFrame(
"Dataframe " + dfname + " is not recognised"
)
###
# OD correction
###
@clogger.log
def correctOD(
self,
figs=True,
odmatch=0.3,
experiments="all",
experimentincludes=False,
experimentexcludes=False,
conditions="all",
conditionincludes=False,
conditionexcludes=False,
):
"""
Corrects OD data for the non-linear relationship between OD and cell
number.
Requires a set of dilution data, with the default being for haploid
yeast growing in glucose (collected by L Bandiera).
An alternative can be loaded from a file - a txt file of two columns,
with the OD specified in the first column and the dilution factor, in
descending order, specified in the second.
Parameters
----------
figs: boolean, optional
If True, a plot of the fit to the dilution data is produced.
odmatch: float, optional
If non-zero, then the corrected OD is rescaled to equal the
measured OD at this value. Only large ODs typically need to be
corrected.
experiments: string or list of strings
The experiments to include.
conditions: string or list of strings
The conditions to include.
experimentincludes: string, optional
Selects only experiments that include the specified string in their
name.
experimentexcludes: string, optional
Ignores experiments that include the specified string in their
name.
conditionincludes: string, optional
Selects only conditions that include the specified string in their
name.
conditionexcludes: string, optional
Ignores conditions that include the specified string in their name.
Examples
--------
>>> p.correctOD()
>>> p.correctOD(figs= False)
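The OD at which the corrected and measured OD agree can be changed,
for example
>>> p.correctOD(odmatch= 0.6)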
"""
exps = sunder.getexps(
self, experiments, experimentincludes, experimentexcludes
)
cons = sunder.getcons(
self,
conditions,
conditionincludes,
conditionexcludes,
nomedia=False,
)
for exp in exps:
for c in cons:
if self.sc[
(self.sc.experiment == exp) & (self.sc.condition == c)
]["OD corrected"].any():
print(exp, ": OD is already corrected for", c)
else:
# fit dilution data
if not self.progress["gc"][exp]:
ODfname = (
str(self.wdirpath / self.progress["ODfname"][exp])
if self.progress["ODfname"][exp]
else None
)
gc = corrections.findODcorrection(
self.wdirpath,
ODfname,
exp,
figs,
odmatch,
)
self.progress["gc"][exp] = gc
# copy gc to experiments with the same ODfname
for e in self.allexperiments:
if (
self.progress["ODfname"][e]
== self.progress["ODfname"][exp]
):
self.progress["gc"][e] = gc
# correct all wells
gc.batchpredict(
self.r.query("experiment == @exp and condition == @c")[
"OD"
].to_numpy()
)
# update r dataframe
self.r.loc[
(self.r.experiment == exp) & (self.r.condition == c),
"OD",
] = gc.f
# flag corrections in summary stats dataframe
self.sc.loc[
(self.sc.experiment == exp) & (self.sc.condition == c),
"OD corrected",
] = True
# update s dataframe
admin.update_s(self)
###
# Media correction
###
###
# Statistical analysis
###
@clogger.log
def getstats(
self,
dtype="OD",
bd=False,
cvfn="matern",
empirical_errors=False,
noruns=10,
exitearly=True,
noinits=100,
nosamples=100,
logs=True,
iskip=False,
stats=True,
figs=True,
findareas=False,
plotlocalmax=True,
showpeakproperties=False,
experiments="all",
experimentincludes=False,
experimentexcludes=False,
conditions="all",
conditionincludes=False,
conditionexcludes=False,
strains="all",
strainincludes=False,
strainexcludes=False,
**kwargs,
):
"""
Calls fitderiv.py to estimate the first and second time-derivatives of,
typically, OD using a Gaussian process (Swain et al., 2016) and finds
corresponding summary statistics.
The derivatives are stored in the .s dataframe; summary statistics are
stored in the .sc dataframe.
Parameters
----------
dtype: string, optional
The type of data - 'OD', 'GFP', 'c-GFPperOD', or 'c-GFP' - for
which the derivatives are to be found. The data must exist in the
.r or .s dataframes.
bd: dictionary, optional
The bounds on the hyperparameters for the Gaussian process.
For example, bd= {1: (-2, 0)} fixes the bounds on the
hyperparameter controlling flexibility to be 1e-2 and 1e0.
The default for a Matern covariance function
is {0: (-5,5), 1: (-4,4), 2: (-5,2)},
where the first element controls amplitude, the second controls
flexibility, and the third determines the magnitude of the
measurement error.
cvfn: string, optional
The covariance function used in the Gaussian process, either
'matern' or 'sqexp' or 'nn'.
empirical_errors: boolean, optional
If True, measurement errors are empirically estimated from the
variance across replicates at each time point and so vary with
time.
If False, the magnitude of the measurement error is fit from the
data assuming that this magnitude is the same at all time points.
noruns: integer, optional
The number of attempts made for each fit. Each attempt is made
with random initial estimates of the hyperparameters within their
bounds.
exitearly: boolean, optional
If True, stop at the first successful fit.
If False, use the best fit from all successful fits.
noinits: integer, optional
The number of random attempts to find a good initial condition
before running the optimization.
nosamples: integer, optional
The number of samples used to calculate errors in statistics by
bootstrapping.
logs: boolean, optional
If True, find the derivative of the log of the data; logs should be
True to determine the specific growth rate when dtype= 'OD'.
iskip: integer, optional
Use only every iskip'th data point to increase speed.
stats: boolean, optional
If False, do not calculate statistics.
figs: boolean, optional
If True, plot both the fits and inferred derivative.
findareas: boolean, optional
If True, find the area under the plot of gr vs OD and the area
under the plot of OD vs time. Setting to True can make getstats
slow.
plotlocalmax: boolean, optional
If True, mark on any plots the highest local maximum found, which
is used to calculate statistics.
showpeakproperties: boolean, optional
If True, show properties of any local peaks that have been found by
scipy's find_peaks. Additional properties can be specified as
kwargs and are passed to find_peaks.
experiments: string or list of strings
The experiments to include.
conditions: string or list of strings
The conditions to include.
strains: string or list of strings
The strains to include.
experimentincludes: string, optional
Selects only experiments that include the specified string in their
name.
experimentexcludes: string, optional
Ignores experiments that include the specified string in their
name.
conditionincludes: string, optional
Selects only conditions that include the specified string in their
name.
conditionexcludes: string, optional
Ignores conditions that include the specified string in their name.
strainincludes: string, optional
Selects only strains that include the specified string in their
name.
strainexcludes: string, optional
Ignores strains that include the specified string in their name.
kwargs: for scipy's find_peaks
To set the minimum property of a peak. e.g. prominence= 0.1 and
width= 15 (specified in numbers of x-points or y-points and not
real units).
https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.find_peaks.html
Examples
--------
>>> p.getstats()
>>> p.getstats(conditionincludes= 'Gal')
>>> p.getstats(noruns= 10, exitearly= False)
If the fits are poor, often changing the bounds on the hyperparameter
for the measurement error helps:
>>> p.getstats(bd= {2: (-3,0)})
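Properties of local peaks can be passed as kwargs to scipy's
find_peaks, for example
>>> p.getstats(showpeakproperties= True, prominence= 0.1,
... width= 15)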
References
----------
PS Swain, K Stevenson, A Leary, LF Montano-Gutierrez, IB Clark,
J Vogel, T Pilizota. (2016). Inferring time derivatives including cell
growth rates using Gaussian processes. Nat Commun, 7, 1-8.
"""
linalgmax = 5
warnings = ""
if dtype == "OD" and logs:
derivname = "gr"
else:
derivname = "d/dt " + dtype
snames = [
"max " + derivname,
"time of max " + derivname,
]
if dtype == "OD" and logs:
# special names with estimating specific growth rate
snames += ["doubling time", "lag time"]
else:
snames += [
"doubling time from " + derivname,
"lag time from " + derivname,
]
if logs:
ylabels = ["log(" + dtype + ")", derivname]
else:
ylabels = [dtype, derivname]
# extract data
exps, cons, strs = sunder.getall(
self,
experiments,
experimentincludes,
experimentexcludes,
conditions,
conditionincludes,
conditionexcludes,
strains,
strainincludes,
strainexcludes,
)
# find growth rate and stats
for e in exps:
for c in cons:
for s in strs:
figtitle = e + ": " + s + " in " + c
if dtype in self.r.columns:
# raw data
d = sunder.extractwells(self.r, self.s, e, c, s, dtype)
t = self.s.query(
"experiment == @e and condition == @c and "
"strain == @s"
)["time"].to_numpy()
elif dtype in self.s.columns:
# processed data
df = self.s.query(
"experiment == @e and condition == @c and "
"strain == @s"
)
# add columns plus and minus err
df = omplot.augmentdf(df, dtype)[
[dtype, "augtype", "time"]
]
piv_df = df.pivot(index="time", columns="augtype", values=dtype)
# convert to array for fitderiv
d = piv_df.values
t = piv_df.index.to_numpy()
numberofnans = np.count_nonzero(np.isnan(d))
if np.any(numberofnans):
print(f"\nWarning: {numberofnans} NaNs in data")
else:
print(dtype, "not recognized for", figtitle)
return
# checks
if d.size == 0:
# no data
print("No data found for", dtype, "for", figtitle)
break
if logs:
print("\nFitting log(" + dtype + ") for", figtitle)
else:
print("\nFitting", dtype, "for", figtitle)
# call fitderiv
f = fitderiv(
t,
d,
cvfn=cvfn,
logs=logs,
bd=bd,
empirical_errors=empirical_errors,
statnames=snames,
noruns=noruns,
noinits=noinits,
exitearly=exitearly,
linalgmax=linalgmax,
nosamples=nosamples,
iskip=iskip,
)
if f.success:
if figs:
plt.figure()
plt.subplot(2, 1, 1)
f.plotfit(
"f", ylabel=ylabels[0], figtitle=figtitle
)
axgr = plt.subplot(2, 1, 2)
f.plotfit("df", ylabel=ylabels[1])
plt.tight_layout()
# find summary statistics
outdf, statsdict, warning = omstats.findsummarystats(
dtype,
derivname,
logs,
nosamples,
f,
t,
e,
c,
s,
findareas,
figs,
plotlocalmax,
axgr,
showpeakproperties,
**kwargs,
)
if warning:
    warningtext += warning
# store results in instance's dataframes
statsdict[
"logmaxlikehood for " + derivname
] = f.logmaxlike
statsdict["gp for " + derivname] = cvfn
if stats:
for sname in f.ds.keys():
statsdict[sname] = f.ds[sname]
# add growth rates, etc., to dataframe of summary data
if derivname not in self.s.columns:
# add new columns to dataframe
self.s = pd.merge(self.s, outdf, how="outer")
else:
# update dataframe
self.s = gu.absorbdf(
self.s,
outdf,
["experiment", "condition", "strain", "time"],
)
# create or add summary stats to stats dataframe
statsdf = pd.DataFrame(
statsdict, index=pd.RangeIndex(0, 1, 1)
)
newstats = np.count_nonzero(
    [stat not in self.sc.columns for stat in statsdict]
)
if newstats:
# add new columns to dataframe
self.sc = pd.merge(self.sc, statsdf, how="outer")
else:
# update dataframe
self.sc = gu.absorbdf(
self.sc,
statsdf,
["experiment", "condition", "strain"],
)
if figs:
plt.show()
omstats.cleansc(self)
if warningtext:
    print(warningtext)
###
@clogger.log
def averageoverexpts(
self,
condition,
strain,
tvr="OD mean",
bd=False,
addnoise=True,
plot=False,
):
"""
Uses a Matern Gaussian process to average a time-dependent variable
over all experiments.
An alternative, and usually the better first choice, is to use
addcommonvar.
Parameters
----------
condition: string
The condition of interest.
strain: string
The strain of interest.
tvr: string
The time-dependent variable to be averaged.
For example, 'c-GFPperOD' or 'OD mean'.
bd: dictionary, optional
The limits on the hyperparameters for the Matern Gaussian process.
For example, {0: (-5,5), 1: (-4,4), 2: (-5,2)}
where the first element controls amplitude, setting the bounds to
1e-5 and 1e5, the second controls flexibility, and the third
determines the magnitude of the measurement error.
addnoise: boolean
If True, add the fitted magnitude of the measurement noise to the
predicted standard deviation for better comparison with the spread
of the data.
Returns
-------
res: dictionary
{'t' : time, tvr : time-dependent data, 'mn' : mean,
'sd' : standard deviation}
where 'mn' is the average found and 'sd' is its standard deviation.
'tvr' is the data used to find the average.
Examples
--------
>>> p.averageoverexpts('1% Gal', 'GAL2', bd= {1: (-1, -1)})
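The returned dictionary can be used for plotting - with matplotlib
imported as plt:
>>> res = p.averageoverexpts('1% Gal', 'GAL2')
>>> plt.plot(res['t'], res['mn'])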
"""
# boundaries on hyperparameters
if "OD" in tvr:
bds = {0: (-4, 4), 1: (-1, 4), 2: (-6, 2)}
else:
bds = {0: (2, 12), 1: (-1, 4), 2: (4, 10)}
if bd:
bds = gu.mergedicts(original=bds, update=bd)
# extract data
df = self.s[["experiment", "condition", "strain", "time", tvr]]
ndf = df.query("condition == @condition and strain == @strain")
# use GP to average over experiments
x = ndf["time"].to_numpy()
y = ndf[tvr].to_numpy()
ys = y[np.argsort(x)]
xs = np.sort(x)
g = gp.maternGP(bds, xs, ys)
print(
    "averaging", tvr, "over experiments for", strain, "in", condition
)
g.findhyperparameters(noruns=2, noinits=1000)
g.results()
g.predict(xs, addnoise=addnoise)
if plot:
plt.figure()
g.sketch(".")
plt.title("averaging " + strain + " in " + condition)
plt.xlabel("time")
plt.ylabel(tvr)
plt.show()
# return results as a dictionary
res = {"t": xs, tvr: ys, "mn": g.f, "sd": np.sqrt(g.fvar)}
return res
###
# Fluorescence corrections
###
@clogger.log
def correctauto(
self,
f=["GFP", "AutoFL"],
refstrain="WT",
figs=True,
experiments="all",
experimentincludes=False,
experimentexcludes=False,
conditions="all",
conditionincludes=False,
conditionexcludes=False,
strains="all",
strainincludes=False,
strainexcludes=False,
):
"""
Corrects fluorescence data for autofluorescence by comparing with the
fluorescence of an untagged reference strain.
The reference strain is used to estimate the autofluorescence via
either the method of Lichten et al., 2014, where measurements of
fluorescence at two wavelengths are required, or by using the
fluorescence of the reference strain interpolated to the OD of the
strain of interest (Berthoumieux et al., 2013).
Using two measurements of fluorescence is thought to be more accurate,
particularly for low fluorescence measurements (Mihalcescu et al.,
2015).
Parameters
----------
f: string or list of strings
The fluorescence measurements, typically either ['mCherry'] or
['GFP', 'AutoFL'].
refstrain: string
The reference strain.
figs: boolean
If True, display plots showing the fits to the reference strain's
fluorescence.
experiments: string or list of strings
The experiments to include.
conditions: string or list of strings
The conditions to include.
strains: string or list of strings
The strains to include.
experimentincludes: string, optional
Selects only experiments that include the specified string in
their name.
experimentexcludes: string, optional
Ignores experiments that include the specified string in their
name.
conditionincludes: string, optional
Selects only conditions that include the specified string in their
name.
conditionexcludes: string, optional
Ignores conditions that include the specified string in their name.
strainincludes: string, optional
Selects only strains that include the specified string in their
name.
strainexcludes: string, optional
Ignores strains that include the specified string in their name.
Notes
-----
In principle
>>> p.correctmedia()
should be run before running correctauto when processing data with two
fluorescence measurements.
It is unnecessary with only one fluorescence measurement because the
normalisation is then done directly with the reference strain's
fluorescence and this fluorescence can include the fluorescence from
the media.
In practice, running correctmedia may generate negative values of the
fluorescence at some time points. These negative values will create
NaNs in the corrected fluorescence, which are normally harmless.
With sufficiently many negative values of the fluorescence, however,
correcting data with two fluorescence measurements can become
corrupted.
If correctmedia generates negative fluorescence values, we therefore
recommend comparing the corrected fluorescence between
>>> p.correctmedia()
>>> p.correctauto(['GFP', 'AutoFL'])
and
>>> p.correctauto('GFP')
to determine if these negative values are deleterious.
Examples
--------
To correct data with one type of fluorescence measurement, use:
>>> p.correctauto('GFP')
>>> p.correctauto('mCherry', refstrain= 'BY4741')
To correct data with two types of fluorescence measurement, use:
>>> p.correctauto(['GFP', 'AutoFL'])
>>> p.correctauto(['GFP', 'AutoFL'], refstrain= 'wild-type')
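The corrected fluorescence can then be plotted from the .s dataframe,
for example
>>> p.plot(y= 'c-GFPperOD', hue= 'condition')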
References
----------
S Berthoumieux, H De Jong, G Baptist, C Pinel, C Ranquet, D Ropers,
J Geiselmann (2013).
Shared control of gene expression in bacteria by transcription factors
and global physiology of the cell.
Mol Syst Biol, 9, 634.
CA Lichten, R White, IB Clark, PS Swain (2014).
Unmixing of fluorescence spectra to resolve quantitative time-series
measurements of gene expression in plate readers.
BMC Biotech, 14, 1-11.
I Mihalcescu, MVM Gateau, B Chelli, C Pinel, JL Ravanat (2015).
Green autofluorescence, a double edged monitoring tool for bacterial
growth and activity in micro-plates.
Phys Biol, 12, 066016.
"""
f = gu.makelist(f)
exps, cons, strs = sunder.getall(
self,
experiments,
experimentincludes,
experimentexcludes,
conditions,
conditionincludes,
conditionexcludes,
strains,
strainincludes,
strainexcludes,
)
# check for negative fluorescence values
for e in exps:
for c in cons:
if self.progress["negativevalues"][e]:
for datatype in f:
if (
datatype in self.progress["negativevalues"][e]
and c in self.progress["negativevalues"][e]
):
print(
e + ": The negative values for",
datatype,
"in",
c,
"will generate NaNs",
)
# going ahead
print("Using", refstrain, "as the reference")
# correct for autofluorescence
if len(f) == 2:
corrections.correctauto2(
self,
f,
refstrain,
figs,
experiments,
experimentincludes,
experimentexcludes,
conditions,
conditionincludes,
conditionexcludes,
strains,
strainincludes,
strainexcludes,
)
elif len(f) == 1:
corrections.correctauto1(
self,
f,
refstrain,
figs,
experiments,
experimentincludes,
experimentexcludes,
conditions,
conditionincludes,
conditionexcludes,
strains,
strainincludes,
strainexcludes,
)
else:
print("f must be a list of length 1 or 2")
###
# Logging
###
@property
def log(self):
"""
Prints a log of all methods called and their arguments.
Example
-------
>>> p.log
"""
print(self.logstream.getvalue())
###
def savelog(self, fname=None):
"""
Save log to file.
Parameters
----------
fname: string, optional
The name of the file. If unspecified, the name of the experiment is
used.
Example
-------
>>> p.savelog()
"""
# export log
if fname:
fnamepath = self.wdirpath / (fname + ".log")
else:
fnamepath = self.wdirpath / ("".join(self.allexperiments) + ".log")
with fnamepath.open("w") as f:
f.write(self.logstream.getvalue())
print("Exported to", str(fnamepath))
###
# Exporting and importing
###
@clogger.log
def exportdf(self, commonname=False, type="tsv"):
"""
Exports the dataframes as either tab-delimited or csv or json files.
Dataframes for the (processed) raw data, for summary data, and for
summary statistics and corrections, as well as a log file, are
exported.
Parameters
----------
commonname: string, optional
The name used for the output files.
If unspecified, the name of the experiment or experiments is used.
type: string
The type of file for export, either 'json' or 'csv' or 'tsv'.
Examples
--------
>>> p.exportdf()
>>> p.exportdf('processed', type= 'json')
"""
if commonname:
fullcommonname = str(self.wdirpath / commonname)
else:
fullcommonname = str(self.wdirpath / "".join(self.allexperiments))
# export data
if type == "json":
self.r.to_json(fullcommonname + "_r.json", orient="split")
self.s.to_json(fullcommonname + "_s.json", orient="split")
self.sc.to_json(fullcommonname + "_sc.json", orient="split")
else:
sep = "\t" if type == "tsv" else ","
self.r.to_csv(fullcommonname + "_r." + type, sep=sep, index=False)
self.s.to_csv(fullcommonname + "_s." + type, sep=sep, index=False)
self.sc.to_csv(
fullcommonname + "_sc." + type, sep=sep, index=False
)
# export log to file
self.savelog(commonname)
###
@clogger.log
def importdf(self, commonnames, info=True, sep="\t"):
"""
Import dataframes saved as either json or csv or tsv files.
Parameters
----------
commonnames: list of strings
A list of names for the files to be imported with one string for
each experiment.
Examples
--------
>>> p.importdf('Gal')
>>> p.importdf(['Gal', 'Glu', 'Raf'])
"""
commonnames = gu.makelist(commonnames)
# import data
for commonname in commonnames:
commonname = str(self.wdirpath / commonname)
for dfname in ["r", "s", "sc"]:
    try:
        # json files
        impdf = pd.read_json(
            commonname + "_" + dfname + ".json", orient="split"
        )
        print("Imported", commonname + "_" + dfname + ".json")
    except (ValueError, FileNotFoundError):
        try:
            # csv files
            impdf = pd.read_csv(
                commonname + "_" + dfname + ".csv", sep=","
            )
            print("Imported", commonname + "_" + dfname + ".csv")
        except FileNotFoundError:
            try:
                # tsv files
                impdf = pd.read_csv(
                    commonname + "_" + dfname + ".tsv", sep="\t"
                )
                print("Imported", commonname + "_" + dfname + ".tsv")
            except FileNotFoundError:
                print(
                    "No file called",
                    commonname
                    + "_"
                    + dfname
                    + ".json or .csv or .tsv found",
                )
                return
    # ensure all are imported as strings
    for var in ["experiment", "condition", "strain"]:
        impdf[var] = impdf[var].astype(str)
    # merge dataframes, avoiding fragile exec calls
    if hasattr(self, dfname):
        setattr(
            self,
            dfname,
            pd.merge(getattr(self, dfname), impdf, how="outer"),
        )
    else:
        setattr(self, dfname, impdf)
print()
# update attributes
self.allexperiments = list(self.s.experiment.unique())
self.allconditions.update(
{
e: list(self.s[self.s.experiment == e].condition.unique())
for e in self.allexperiments
}
)
self.allstrains.update(
{
e: list(self.s[self.s.experiment == e].strain.unique())
for e in self.allexperiments
}
)
# find datatypes with mean in self.s
dtypdict = {}
for e in self.allexperiments:
# drop columns of NaNs - these are created by merge if a datatype
# is in one experiment but not in another
tdf = self.s[self.s.experiment == e].dropna(axis=1, how="all")
dtypdict[e] = list(tdf.columns[tdf.columns.str.contains("mean")])
self.datatypes.update(
{e: [dt.split(" mean")[0] for dt in dtypdict[e]] for e in dtypdict}
)
# initialise progress
for e in self.allexperiments:
admin.initialiseprogress(self, e)
# display info on import
if info:
self.info
# display warning if duplicates created
if len(self.allexperiments) != np.unique(self.allexperiments).size:
print(
"\nLikely ERROR: data with the same experiment, condition, "
"strain, and time now appears twice!!"
)
###
if __name__ == "__main__":
print(platereader.__doc__)