Source code for pyrltr.analyzer.DataAnalyzer

import os
import math
import numpy as np


class DataAnalyzer:
    """Collects the data files of one directory and computes statistics
    (mean / standard deviation per step) over the data sets loaded from
    them.

    The implementation avoids relying on ``map``/``filter`` returning lists,
    so it behaves the same on Python 2 and Python 3.
    """

    def __init__(self, name, steplength=50, groupsize=50):
        """Gathers the data files found directly inside the directory `name`.

        name -- directory that is listed for data files
        steplength -- only every steplength-th step is kept by
                      computeMeanStdOfFiles
        groupsize -- number of consecutive values summed up by
                     computeGroupSums
        """
        # Paths are built as "<name>/<file>" and kept in sorted order.
        self.files = sorted(
            "%s/%s" % (name, f)
            for f in os.listdir(name)
            if self.isDataFile(f)
        )
        self.steplength = steplength
        self.groupSize = groupsize

    def isDataFile(self, filename):
        """Returns True if filename ends with ".dat" or ".dat.gz"."""
        return filename.endswith((".dat", ".dat.gz"))

    def expandData(self, data, computeSum=False):
        """Expands the short datasets with nans to have a uniform length.

        data has to have the form [dataset0, dataset1, ...] where each
        dataset is a 1d array/list of different lengths. Returns a list with
        one float array per dataset, all as long as the longest dataset.
        An empty data yields an empty list.

        computeSum is accepted for backward compatibility and is not used.
        """
        data = list(data)
        if not data:
            return []
        maxLength = max(len(dataset) for dataset in data)
        # Build the nan padding explicitly instead of filling a zero array
        # through the side effects of a (possibly lazy) map().
        return [
            np.concatenate(
                (dataset, np.full(maxLength - len(dataset), np.nan)), axis=0)
            for dataset in data
        ]

    def filterNAN(self, datasets):
        """Returns a list holding, for every dataset, the list of its values
        that are not nan."""
        return [
            [x for x in dataset if not math.isnan(x)]
            for dataset in datasets
        ]

    def loadData(self, filename):
        """Prints filename and returns its content as loaded by
        np.loadtxt."""
        print(filename)
        return np.loadtxt(filename)

    def filterFiles(self, f):
        """Returns the sorted list of the collected files for which the
        predicate f is true."""
        return sorted(path for path in self.files if f(path))

    def computeGroupSums(self, x):
        """Sums up consecutive groups of self.groupSize values of x.

        The length of x has to be a multiple of self.groupSize, otherwise
        the reshape raises a ValueError.
        """
        x = np.array(x)
        # Floor division keeps the group count an integer on Python 3, too.
        groups = x.reshape((x.shape[0] // self.groupSize, self.groupSize))
        return np.sum(groups, axis=1)

    def computeMeanStdOfFiles(self, dataSets, computeSum=False,
                              computeGroupSums=False):
        """Computes the mean values and standard deviations of the values
        within the given data sets.

        dataSets -- sequence of 1d data sets; they are combined step by step
                    with zip, so steps beyond the shortest data set are
                    dropped
        computeSum -- accepted for backward compatibility, not used
        computeGroupSums -- if true, every data set is first reduced with
                            self.computeGroupSums

        Returns (meanValues, stddevs, indexes): the mean and the standard
        deviation of the non-nan values of every self.steplength-th step as
        lists, and the array of the corresponding step indexes.
        """
        if computeGroupSums:
            dataSets = [self.computeGroupSums(d) for d in dataSets]
        # Transpose to one tuple per step and keep only the steps that are
        # reported, so no statistics are computed for discarded steps.
        steps = list(zip(*dataSets))[::self.steplength]
        steps = self.filterNAN(steps)
        meanValues = [np.mean(step) for step in steps]
        stddevs = [np.std(step) for step in steps]
        indexes = np.arange(len(meanValues)) * self.steplength
        return meanValues, stddevs, indexes