# Source code for imagepypelines.core.ml_tools

```python
# @Email: jmaggio14@gmail.com
# @Website: https://www.imagepypelines.org/
# @License: https://github.com/jmaggio14/imagepypelines/blob/master/LICENSE
# @github: https://github.com/jmaggio14/imagepypelines
#
# Copyright (c) 2018-2019 Jeff Maggio, Nathan Dileas, Ryan Hartzell
import numpy as np
from itertools import islice, chain
import scipy.stats
import random
import math

def accuracy(predicted, ground_truth):
    """Compute the fraction of predicted labels that match the ground truth.

    Args:
        predicted(array-like): predicted labels
        ground_truth(array-like): true labels, compared elementwise
            against ``predicted``

    Returns:
        float: number of matching labels divided by ``len(predicted)``
    """
    predicted_arr = np.asarray(predicted)
    truth_arr = np.asarray(ground_truth)
    # elementwise comparison; summing the boolean mask counts the matches
    matches = predicted_arr == truth_arr
    return float(np.sum(matches)) / len(predicted)

def confidence_99(data):
    """Return the mean and the +/- deviation of the 99% confidence interval.

    Convenience wrapper that forwards to ``confidence`` with a confidence
    level of 0.99.

    Args:
        data(array-like): data to find the confidence interval for,
            in machine learning applications, this is usually accuracy
            for K-fold cross validation

    Returns:
        float: the mean of the data
        float: +/- deviation for the 99% confidence interval

    Example:
        >>> import numpy as np
        >>> import imagepypelines as ip
        >>> # create sample test 'accuracies' from a normal distribution
        >>> # mean accuracy is 75%, std is 10% for this example
        >>> accuracies = np.random.normal(.75, .1, 1000)
        >>> # get 99% confidence interval
        >>> mean, error = ip.confidence_99(accuracies)
    """
    level = 0.99
    return confidence(data, level)

def confidence_95(data):
    """Return the mean and the +/- deviation of the 95% confidence interval.

    Convenience wrapper that forwards to ``confidence`` with a confidence
    level of 0.95.

    Args:
        data(array-like): data to find the confidence interval for,
            in machine learning applications, this is usually accuracy
            for K-fold cross validation

    Returns:
        float: the mean of the data
        float: +/- deviation for the 95% confidence interval

    Example:
        >>> import numpy as np
        >>> import imagepypelines as ip
        >>> # create sample test 'accuracies' from a normal distribution
        >>> # mean accuracy is 75%, std is 10% for this example
        >>> accuracies = np.random.normal(.75, .1, 1000)
        >>> # get 95% confidence interval
        >>> mean, error = ip.confidence_95(accuracies)
    """
    level = 0.95
    return confidence(data, level)

def confidence_90(data):
    """Return the mean and the +/- deviation of the 90% confidence interval.

    Convenience wrapper that forwards to ``confidence`` with a confidence
    level of 0.90.

    Args:
        data(array-like): data to find the confidence interval for,
            in machine learning applications, this is usually accuracy
            for K-fold cross validation

    Returns:
        float: the mean of the data
        float: +/- deviation for the 90% confidence interval

    Example:
        >>> import numpy as np
        >>> import imagepypelines as ip
        >>> # create sample test 'accuracies' from a normal distribution
        >>> # mean accuracy is 75%, std is 10% for this example
        >>> accuracies = np.random.normal(.75, .1, 1000)
        >>> # get 90% confidence interval
        >>> mean, error = ip.confidence_90(accuracies)
    """
    level = 0.90
    return confidence(data, level)

def confidence(data, confidence=0.95):
    """Return the mean and the +/- deviation for a given confidence level.

    The deviation is the standard error of the mean scaled by the
    two-sided Student's t critical value with ``len(data) - 1`` degrees
    of freedom.

    Args:
        data(array-like): data to find the confidence interval for,
            in machine learning applications, this is usually accuracy
            for K-fold cross validation
        confidence(float): confidence level between 0-1 to compute the
            mean and deviation for

    Returns:
        float: the mean of the data
        float: +/- deviation for this confidence interval

    Example:
        >>> import numpy as np
        >>> import imagepypelines as ip
        >>> # create sample test 'accuracies' from a normal distribution
        >>> # mean accuracy is 75%, std is 10% for this example
        >>> accuracies = np.random.normal(.75, .1, 1000)
        >>> # get 95% confidence interval
        >>> mean, error = ip.confidence(accuracies,.95)
    """
    samples = np.asarray(data, dtype=np.float32)

    # sample mean and standard error of the mean
    mean = np.mean(samples)
    std_err = scipy.stats.sem(samples)

    # a two-sided interval puts half of the leftover probability in each
    # tail, so the upper quantile is (1 + confidence) / 2
    upper_quantile = (1 + confidence) / 2.0
    dof = len(samples) - 1
    t_critical = scipy.stats.t.ppf(upper_quantile, dof)

    half_width = std_err * t_critical
    return mean, half_width

def chunk(data, n):
    """Split a list into chunks sized for roughly ``n`` pieces.

    The chunk size is ``ceil(len(data) / n)`` and the actual splitting is
    delegated to ``batch``, so the final chunk may be shorter than the
    others.

    Args:
        data(list): the sequence to split, must support ``len``
        n(int): the desired number of chunks

    Returns:
        list: a list of lists, each holding at most
            ``ceil(len(data) / n)`` items
    """
    items_per_chunk = math.ceil(len(data) / n)
    chunks = batch(data, items_per_chunk)
    return chunks

def batch(data, batch_size):
    """Split an iterable into consecutive lists of ``batch_size`` items.

    The last batch is shorter than ``batch_size`` when the number of
    items is not a multiple of ``batch_size``.

    Args:
        data(iterable): the items to split
        batch_size(int): maximum number of items per batch

    Returns:
        list: a list of lists containing the items of ``data`` in order
    """
    source = iter(data)
    batches = []
    while True:
        # pull up to batch_size items; an empty pull ends the loop
        current = list(islice(source, batch_size))
        if not current:
            break
        batches.append(current)
    return batches

def chunks2list(batches):
    """Flatten one level of nested iterables into a single list.

    Args:
        batches(iterable): an iterable whose elements are themselves
            iterable

    Returns:
        list: every item of every inner iterable, in order
    """
    return [item for group in batches for item in group]

def xsample(data, sample_fraction):
    """Randomly sample a fraction of a list without replacement.

    Args:
        data(list): the items to sample from
        sample_fraction(float): fraction of ``data`` to keep, between
            0 and 1; the sample size is ``int(sample_fraction * len(data))``

    Returns:
        list: the randomly chosen items
    """
    assert isinstance(data, list), "data must be a list"
    assert 0 <= sample_fraction <= 1, \
        "sample_fraction must be a float between 0 and 1"

    sample_count = int(sample_fraction * len(data))
    return random.sample(data, sample_count)

def xysample(data, labels, sample_fraction=.05):
    """Randomly sample list data together with its corresponding labels.

    Data and labels are paired up before sampling, so each sampled datum
    keeps its own label. Sampling is done without replacement.

    Args:
        data(list): the items to sample from
        labels(list): one label per item in ``data``
        sample_fraction(float): fraction of the data to keep, between
            0 and 1; the sample size is ``int(sample_fraction * len(data))``

    Returns:
        list: the sampled data
        list: the labels corresponding to the sampled data

    Example:
        >>> import random
        >>> random.seed(0)
        >>> import imagepypelines as ip
        >>> data = [0,1,2,3,4,5,6,7,8,9]
        >>> labels = ['0','1','2','3','4','5','6','7','8','9']
        >>>
        >>> small_data, small_labels = ip.xysample(data,labels,.2)
    """
    assert isinstance(data, list), "data must be a list"
    assert isinstance(labels, list), "labels must be a list"
    assert len(data) == len(labels), \
        "you must have an equal number of data and labels"
    assert 0 <= sample_fraction <= 1, \
        "sample_fraction must be a float between 0 and 1"

    n = int(sample_fraction * len(data))
    # zip(*[]) yields nothing, so unpacking an empty sample into two names
    # would raise ValueError; return two empty lists instead
    if n == 0:
        return [], []

    combined = list(zip(data, labels))
    sampled = random.sample(combined, n)
    sampled_data, sampled_labels = zip(*sampled)
    return list(sampled_data), list(sampled_labels)
```