This page lists ready-to-run shogun examples for the Python Modular interface.
To run an example, issue
python name_of_example.py
# This example shows how to use a custom kernel matrix for training a
# two-class Support Vector Machine (SVM) classifier on randomly generated
# examples. The SVM regularization constant is set to C=1.

from numpy import *
from numpy.random import rand
from shogun.Features import RealFeatures, Labels
from shogun.Kernel import CustomKernel
from shogun.Classifier import LibSVM

C=1
dim=7

lab=sign(2*rand(dim) - 1)
data=rand(dim, dim)
# the elementwise product of a matrix with its transpose is symmetric
symdata=data*data.T

kernel=CustomKernel()
# use the symmetric matrix as the custom kernel
kernel.set_full_kernel_matrix_from_full(symdata)
labels=Labels(lab)
svm=LibSVM(C, kernel, labels)
svm.train()
out=svm.classify().get_labels()
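# A custom kernel matrix is expected to be symmetric and positive
# semi-definite. The following minimal numpy check (an illustrative aside,
# not part of the original example) verifies this for a matrix K:

def check_kernel_matrix(K):
    from numpy import allclose
    from numpy.linalg import eigvalsh
    # symmetry: K must equal its transpose
    assert allclose(K, K.T)
    # positive semi-definiteness: all eigenvalues should be >= 0
    # (up to numerical tolerance)
    return eigvalsh(K).min() >= -1e-10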
# In this example we demonstrate how to use SVMs in a domain adaptation
# scenario. Here, we assume that we have two problem domains, one with
# an abundance of training data (source domain) and one with only a few
# training examples (target domain). These domains are assumed to be
# different but related enough to transfer information between them.
# Thus, we first train an SVM on the source domain and subsequently
# pass this previously trained SVM object to the DASVM, which we train
# on the target domain. The DASVM internally computes a custom linear term
# (for the underlying quadratic program of the dual formulation of the SVM)
# based on the support vectors of the source SVM and the training examples
# of the target SVM. Finally, it can be used for prediction just like any
# other SVM object.
#
import numpy
from shogun.Features import StringCharFeatures, Labels, DNA
from shogun.Kernel import WeightedDegreeStringKernel
from shogun.Classifier import SVMLight, DomainAdaptationSVM

degree=3

# source domain data
fm_train_dna = ['CGCACGTACGTAGCTCGAT', 'CGACGTAGTCGTAGTCGTA',
                'CGACGGGGGGGGGGTCGTA', 'CGACCTAGTCGTAGTCGTA',
                'CGACCACAGTTATATAGTA', 'CGACGTAGTCGTAGTCGTA',
                'CGACGTAGTTTTTTTCGTA', 'CGACGTAGTCGTAGCCCCA',
                'CAAAAAAAAAAAAAAAATA', 'CGACGGGGGGGGGGGCGTA']
label_train_dna = numpy.array(5*[-1.0] + 5*[1.0])
fm_test_dna = ['AGCACGTACGTAGCTCGAT', 'AGACGTAGTCGTAGTCGTA',
               'CAACGGGGGGGGGGTCGTA', 'CGACCTAGTCGTAGTCGTA',
               'CGAACACAGTTATATAGTA', 'CGACCTAGTCGTAGTCGTA',
               'CGACGTGGGGTTTTTCGTA', 'CGACGTAGTCCCAGCCCCA',
               'CAAAAAAAAAAAACCAATA', 'CGACGGCCGGGGGGGCGTA']
label_test_dna = numpy.array(5*[-1.0] + 5*[1.0])

# target domain data
fm_train_dna2 = ['AGACAGTCAGTCGATAGCT', 'AGCAGTCGTAGTCGTAGTC',
                 'AGCAGGGGGGGGGGTAGTC', 'AGCAATCGTAGTCGTAGTC',
                 'AGCAACACGTTCTCTCGTC', 'AGCAGTCGTAGTCGTAGTC',
                 'AGCAGTCGTTTTTTTAGTC', 'AGCAGTCGTAGTCGAAAAC',
                 'ACCCCCCCCCCCCCCCCTC', 'AGCAGGGGGGGGGGGAGTC']
label_train_dna2 = numpy.array(5*[-1.0] + 5*[1.0])
fm_test_dna2 = ['CGACAGTCAGTCGATAGCT', 'CGCAGTCGTAGTCGTAGTC',
                'ACCAGGGGGGGGGGTAGTC', 'AGCAATCGTAGTCGTAGTC',
                'AGCCACACGTTCTCTCGTC', 'AGCAATCGTAGTCGTAGTC',
                'AGCAGTGGGGTTTTTAGTC', 'AGCAGTCGTAAACGAAAAC',
                'ACCCCCCCCCCCCAACCTC', 'AGCAGGAAGGGGGGGAGTC']
label_test_dna2 = numpy.array(5*[-1.0] + 5*[1.0])

C = 1.0

# train an SVM on the source domain
feats_train = StringCharFeatures(fm_train_dna, DNA)
feats_test = StringCharFeatures(fm_test_dna, DNA)
kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)
labels = Labels(label_train_dna)
svm = SVMLight(C, kernel, labels)
svm.train()

#####################################
print "obtaining DA SVM from previously trained SVM"

# train the DASVM on the target domain
feats_train2 = StringCharFeatures(fm_train_dna2, DNA)
feats_test2 = StringCharFeatures(fm_test_dna2, DNA)
kernel2 = WeightedDegreeStringKernel(feats_train2, feats_train2, degree)
labels2 = Labels(label_train_dna2)

# we regularize against the previously obtained solution
dasvm = DomainAdaptationSVM(C, kernel2, labels2, svm, 1.0)
dasvm.train()

out = dasvm.classify(feats_test2).get_labels()
print out
# In this example a multi-class support vector machine is trained on a toy data
# set and the trained classifier is then used to predict labels of test
# examples. The training algorithm is based on the BSVM formulation (L2-soft margin
# and the bias added to the objective function), which is solved by the Improved
# Mitchell-Demyanov-Malozemov algorithm. The training uses a Gaussian
# kernel of width 2.1 and the regularization constant C=1. The solver stops if the
# relative duality gap falls below 1e-5.
#
# For more details on the SVM solver used, see
# V.Franc: Optimization Algorithms for Kernel Methods. Research report.
# CTU-CMP-2005-22. CTU FEL Prague. 2005.
# ftp://cmp.felk.cvut.cz/pub/cmp/articles/franc/Franc-PhD.pdf .
#
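# A Gaussian kernel of width w computes k(x,y) = exp(-||x-y||^2 / w),
# assuming the width enters as the denominator as in shogun's GaussianKernel.
# A minimal numpy sketch of this formula (an illustrative aside, not used by
# the example itself):
def gaussian_kernel_value (x, y, width):
    from numpy import exp, dot
    d=x-y
    return exp(-dot(d, d)/width)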
def gmnpsvm ():
print 'GMNPSVM'
from shogun.Features import RealFeatures, Labels
from shogun.Kernel import GaussianKernel
from shogun.Classifier import GMNPSVM
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
width=2.1
kernel=GaussianKernel(feats_train, feats_train, width)
C=1
epsilon=1e-5
labels=Labels(label_train_multiclass)
svm=GMNPSVM(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.train(feats_train)
#kernel.init(feats_train, feats_test)
out=svm.classify(feats_test).get_labels()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
label_train_multiclass=lm.load_labels('../data/label_train_multiclass.dat')
gmnpsvm()
# In this example a two-class support vector machine classifier is trained on a
# toy data set and the trained classifier is then used to predict labels of test
# examples. As training algorithm the Gradient Projection-based Decomposition
# Technique (GPDT) is used with SVM regularization parameter C=1 and a Gaussian
# kernel of width 2.1. The solver returns an epsilon-precise (epsilon=1e-5) solution.
#
# For more details on the GPDT solver see http://dm.unife.it/gpdt .
#
def gpbtsvm ():
print 'GPBTSVM'
from shogun.Features import RealFeatures, Labels
from shogun.Kernel import GaussianKernel
from shogun.Classifier import GPBTSVM
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
width=2.1
kernel=GaussianKernel(feats_train, feats_train, width)
C=1
epsilon=1e-5
labels=Labels(label_train_twoclass)
svm=GPBTSVM(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.train()
kernel.init(feats_train, feats_test)
svm.classify().get_labels()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
label_train_twoclass=lm.load_labels('../data/label_train_twoclass.dat')
gpbtsvm()
# This example shows usage of a k-nearest neighbor (KNN) classification rule on
# a toy data set. The number of nearest neighbors is set to k=3 and distances
# are measured with the Euclidean metric. Finally, the KNN rule is applied to
# predict labels of the test examples.
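# A minimal numpy sketch of the KNN rule itself (an illustrative aside; the
# example below uses shogun's KNN class). X holds one training example per
# column, y the training labels, and x is a single test point:
def knn_predict (x, X, y, k=3):
    from numpy import sum, argsort
    from collections import Counter
    dists=sum((X - x.reshape(-1, 1))**2, axis=0)  # squared Euclidean distances
    nearest=argsort(dists)[:k]                    # indices of the k closest points
    return Counter(y[nearest].tolist()).most_common(1)[0][0]  # majority vote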
def knn ():
print 'KNN'
from shogun.Features import RealFeatures, Labels
from shogun.Classifier import KNN
from shogun.Distance import EuclidianDistance
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=EuclidianDistance(feats_train, feats_train)
k=3
labels=Labels(label_train_multiclass)
knn=KNN(k, distance, labels)
knn.train()
output=knn.classify(feats_test).get_labels()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
label_train_multiclass=lm.load_labels('../data/label_train_multiclass.dat')
knn()
# In this example a multi-class support vector machine classifier is trained on a
# toy data set and the trained classifier is then used to predict labels of test
# examples. As training algorithm the LaRank algorithm is used with SVM
# regularization parameter C=1, a Gaussian kernel of width 2.1, and the
# precision parameter epsilon=1e-5.
#
# For more details on LaRank see
# Bordes, A. and Bottou, L. and Gallinari, P. and Weston, J.
# Solving MultiClass Support Vector Machines with LaRank. ICML 2007.
#
def larank ():
print 'LaRank'
from shogun.Features import RealFeatures, Labels
from shogun.Kernel import GaussianKernel
from shogun.Classifier import LaRank
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
width=2.1
kernel=GaussianKernel(feats_train, feats_train, width)
C=1
epsilon=1e-5
labels=Labels(label_train_multiclass)
svm=LaRank(C, kernel, labels)
#svm.set_tau(1e-3)
#svm.set_batch_mode(False)
#svm.io.enable_progress()
svm.set_epsilon(epsilon)
svm.train()
out=svm.classify(feats_test).get_labels()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
label_train_multiclass=lm.load_labels('../data/label_train_multiclass.dat')
larank()
# In this example a two-class linear classifier based on the Linear Discriminant
# Analysis (LDA) is trained on a toy data set and then the trained classifier is
# used to predict test examples. The regularization parameter, which corresponds
# to the weight of an identity matrix added to the covariance matrix, is set to
# gamma=3.
#
# For more details on the LDA see e.g.
# http://en.wikipedia.org/wiki/Linear_discriminant_analysis
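# A minimal numpy sketch of the regularization described above (an
# illustrative aside): gamma scales an identity matrix that is added to the
# covariance estimate, stabilizing its inversion for ill-conditioned data.
def regularized_covariance (X, gamma):
    from numpy import cov, eye
    C=cov(X)                          # rows are features, columns are examples
    return C + gamma*eye(C.shape[0])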
def lda ():
print 'LDA'
from shogun.Features import RealFeatures, Labels
from shogun.Classifier import LDA
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
gamma=3
num_threads=1
labels=Labels(label_train_twoclass)
lda=LDA(gamma, feats_train, labels)
lda.train()
lda.get_bias()
lda.get_w()
lda.set_features(feats_test)
lda.classify().get_labels()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
label_train_twoclass=lm.load_labels('../data/label_train_twoclass.dat')
lda()
# In this example a two-class linear support vector machine classifier is trained
# on a toy data set and the trained classifier is then used to predict labels of
# test examples. As training algorithm the LIBLINEAR solver is used with the SVM
# regularization parameter C=0.9, the bias in the classification rule switched
# on, and the precision parameter epsilon=1e-5.
#
# For more details on LIBLINEAR see
# http://www.csie.ntu.edu.tw/~cjlin/liblinear/
def liblinear ():
print 'LibLinear'
from shogun.Features import RealFeatures, SparseRealFeatures, Labels
from shogun.Classifier import LibLinear
realfeat=RealFeatures(fm_train_real)
feats_train=SparseRealFeatures()
feats_train.obtain_from_simple(realfeat)
realfeat=RealFeatures(fm_test_real)
feats_test=SparseRealFeatures()
feats_test.obtain_from_simple(realfeat)
C=0.9
epsilon=1e-5
num_threads=1
labels=Labels(label_train_twoclass)
svm=LibLinear(C, feats_train, labels)
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.set_bias_enabled(True)
svm.train()
svm.set_features(feats_test)
svm.classify().get_labels()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
label_train_twoclass=lm.load_labels('../data/label_train_twoclass.dat')
liblinear()
# In this example a two-class support vector machine classifier is trained on a
# 2-dimensional randomly generated data set and the trained classifier is used to
# predict labels of test examples. As training algorithm the LIBSVM solver is used
# with SVM regularization parameter C=1 and a Gaussian kernel of width 2.1.
#
# For more details on the LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/

from numpy import *
from numpy.random import randn
from shogun.Features import *
from shogun.Classifier import *
from shogun.Kernel import *

num=1000
dist=1
width=2.1
C=1

traindata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1)
testdata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1)

trainlab=concatenate((-ones(num), ones(num)))
testlab=concatenate((-ones(num), ones(num)))

feats_train=RealFeatures(traindata_real)
feats_test=RealFeatures(testdata_real)
kernel=GaussianKernel(feats_train, feats_train, width)

labels=Labels(trainlab)
svm=LibSVM(C, kernel, labels)
svm.train()

kernel.init(feats_train, feats_test)
out=svm.classify().get_labels()
testerr=mean(sign(out)!=testlab)
print testerr
# In this example a two-class support vector machine classifier is trained on a
# toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the LIBSVM solver is used with SVM
# regularization parameter C=1 and a Gaussian kernel of width 2.1 and the
# precision parameter epsilon=1e-5. The example also shows how to retrieve the
# support vectors from the trained SVM model.
#
# For more details on LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/
def libsvm ():
print 'LibSVM'
from shogun.Features import RealFeatures, Labels
from shogun.Kernel import GaussianKernel
from shogun.Classifier import LibSVM
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
width=2.1
kernel=GaussianKernel(feats_train, feats_train, width)
C=1
epsilon=1e-5
labels=Labels(label_train_twoclass)
svm=LibSVM(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.train()
kernel.init(feats_train, feats_test)
svm.classify().get_labels()
sv_idx=svm.get_support_vectors()
alphas=svm.get_alphas()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
label_train_twoclass=lm.load_labels('../data/label_train_twoclass.dat')
libsvm()
# In this example a multi-class support vector machine classifier is trained on a
# toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the LIBSVM solver is used with SVM
# regularization parameter C=1 and a Gaussian kernel of width 2.1 and the
# precision parameter epsilon=1e-5.
#
# For more details on LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/
def libsvm_multiclass ():
print 'LibSVMMultiClass'
from shogun.Features import RealFeatures, Labels
from shogun.Kernel import GaussianKernel
from shogun.Classifier import LibSVMMultiClass
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
width=2.1
kernel=GaussianKernel(feats_train, feats_train, width)
C=1
epsilon=1e-5
labels=Labels(label_train_multiclass)
svm=LibSVMMultiClass(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.train()
kernel.init(feats_train, feats_test)
svm.classify().get_labels()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
label_train_multiclass=lm.load_labels('../data/label_train_multiclass.dat')
libsvm_multiclass()
# In this example a one-class support vector machine classifier is trained on a
# toy data set. The training algorithm finds a hyperplane in the RKHS which
# separates the training data from the origin. The one-class classifier is
# typically used to estimate the support of a high-dimensional distribution.
# For more details see e.g.
# B. Schoelkopf et al. Estimating the support of a high-dimensional
# distribution. Neural Computation, 13, 2001, 1443-1471.
#
# In the example, the one-class SVM is trained by the LIBSVM solver with the
# regularization parameter C=1 and the Gaussian kernel of width 2.1 and the
# precision parameter epsilon=1e-5.
#
# For more details on LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/
def libsvm_oneclass ():
print 'LibSVMOneClass'
from shogun.Features import RealFeatures, Labels
from shogun.Kernel import GaussianKernel
from shogun.Classifier import LibSVMOneClass
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
width=2.1
kernel=GaussianKernel(feats_train, feats_train, width)
C=1
epsilon=1e-5
svm=LibSVMOneClass(C, kernel)
svm.set_epsilon(epsilon)
svm.train()
kernel.init(feats_train, feats_test)
svm.classify().get_labels()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
libsvm_oneclass()
# In this example a two-class support vector machine classifier is trained on a
# toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the Minimal Primal Dual SVM is used with SVM
# regularization parameter C=1, a Gaussian kernel of width 2.1 and the
# precision parameter epsilon=1e-5.
#
# For more details on the MPD solver see
# Kienzle, W. and B. Schölkopf: Training Support Vector Machines with Multiple
# Equality Constraints. Machine Learning: ECML 2005, 182-193. (Eds.) Carbonell,
# J. G., J. Siekmann, Springer, Berlin, Germany (November 2005)
def mpdsvm ():
print 'MPDSVM'
from shogun.Features import RealFeatures, Labels
from shogun.Kernel import GaussianKernel
from shogun.Classifier import MPDSVM
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
width=2.1
kernel=GaussianKernel(feats_train, feats_train, width)
C=1
epsilon=1e-5
labels=Labels(label_train_twoclass)
svm=MPDSVM(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.train()
kernel.init(feats_train, feats_test)
svm.classify().get_labels()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
label_train_twoclass=lm.load_labels('../data/label_train_twoclass.dat')
mpdsvm()
# This example shows usage of the Perceptron algorithm for training a two-class
# linear classifier, i.e. y = sign(<x,w> + b). The Perceptron algorithm works by
# iteratively passing through the training examples and applying the update rule to
# those examples which are misclassified by the current classifier. The Perceptron
# update rule reads
#
# w(t+1) = w(t) + alpha * y_t * x_t
# b(t+1) = b(t) + alpha * y_t
#
# where (x_t,y_t) are the feature vector and label (must be +1/-1) of the misclassified example,
# (w(t),b(t)) are the current parameters of the linear classifier,
# (w(t+1),b(t+1)) are the new parameters of the linear classifier, and
# alpha is the learning rate; in this example alpha=1.
#
# The Perceptron algorithm iterates until all training examples are correctly
# classified or the prescribed maximal number of iterations, in this example
# max_iter=1000, is reached.
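# A minimal numpy sketch of the update rule above (an illustrative aside;
# the example below uses shogun's Perceptron class). X holds one training
# example per column and y the +1/-1 labels:
def perceptron_train (X, y, alpha=1.0, max_iter=1000):
    from numpy import zeros, dot, sign
    w=zeros(X.shape[0])
    b=0.0
    for it in xrange(max_iter):
        mistakes=0
        for t in xrange(X.shape[1]):
            if sign(dot(w, X[:, t]) + b) != y[t]:  # misclassified example
                w=w + alpha*y[t]*X[:, t]           # w(t+1) = w(t) + alpha*y_t*x_t
                b=b + alpha*y[t]                   # b(t+1) = b(t) + alpha*y_t
                mistakes+=1
        if mistakes == 0:
            break                                  # all examples correctly classified
    return w, b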
def perceptron ():
print 'Perceptron'
from shogun.Features import RealFeatures, Labels
from shogun.Classifier import Perceptron
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
learn_rate=1.
max_iter=1000
num_threads=1
labels=Labels(label_train_twoclass)
perceptron=Perceptron(feats_train, labels)
perceptron.set_learn_rate(learn_rate)
perceptron.set_max_iter(max_iter)
# only guaranteed to converge for separable data
perceptron.train()
perceptron.set_features(feats_test)
perceptron.classify().get_labels()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
label_train_twoclass=lm.load_labels('../data/label_train_twoclass.dat')
perceptron()
# In this example a two-class linear support vector machine classifier is trained
# on a toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the steepest descent subgradient algorithm is
# used. The SVM regularization parameter is set to C=0.9 and the bias in the
# classification rule is switched off. The solver iterates until it finds an
# epsilon-precise solution (epsilon=1e-3) or the maximal training time
# max_train_time=1 (seconds) is exceeded. The unbiased linear rule is trained.
#
# Note that this solver often does not converge because the steepest descent
# subgradient algorithm is oversensitive to rounding errors. Note also that this
# is unpublished work which was a predecessor of the OCAS solver (see
# classifier_svmocas).
def subgradient_svm ():
print 'SubGradientSVM'
from shogun.Features import RealFeatures, SparseRealFeatures, Labels
from shogun.Classifier import SubGradientSVM
realfeat=RealFeatures(fm_train_real)
feats_train=SparseRealFeatures()
feats_train.obtain_from_simple(realfeat)
realfeat=RealFeatures(fm_test_real)
feats_test=SparseRealFeatures()
feats_test.obtain_from_simple(realfeat)
C=0.9
epsilon=1e-3
num_threads=1
max_train_time=1.
labels=Labels(label_train_twoclass)
svm=SubGradientSVM(C, feats_train, labels)
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.set_bias_enabled(False)
svm.set_max_train_time(max_train_time)
svm.train()
svm.set_features(feats_test)
svm.classify().get_labels()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
label_train_twoclass=lm.load_labels('../data/label_train_twoclass.dat')
subgradient_svm()
# In this example a two-class support vector machine classifier is trained on a
# DNA splice-site detection data set and the trained classifier is used to predict
# labels on the test set. As training algorithm SVM^light is used with SVM
# regularization parameter C=1 and the Weighted Degree kernel of degree 20 and
# a precision parameter epsilon=1e-5. The LINADD trick is used to speed up
# training.
#
# For more details on the SVM^light see
# T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel
# Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA USA, 1999.
#
# For more details on the Weighted Degree kernel and the LINADD trick see
# Sonnenburg, S. and Rätsch, G. and Rieck, K. Large Scale Learning with String
# Kernels. In Bottou, L., Chapelle, O., DeCoste, D. and Weston, J., editors,
# Large Scale Kernel Machines, pages 73-103, MIT Press, Cambridge, MA. 2007.
#
def do_batch_linadd ():
print 'SVMlight batch'
from shogun.Features import StringCharFeatures, Labels, DNA
from shogun.Kernel import WeightedDegreeStringKernel
try:
from shogun.Classifier import SVMLight
except ImportError:
print 'No support for SVMLight available.'
return
feats_train=StringCharFeatures(DNA)
feats_train.set_features(fm_train_dna)
feats_test=StringCharFeatures(DNA)
feats_test.set_features(fm_test_dna)
degree=20
kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)
C=1
epsilon=1e-5
num_threads=2
labels=Labels(label_train_dna)
svm=SVMLight(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.train()
kernel.init(feats_train, feats_test)
#print 'SVMLight Objective: %f num_sv: %d' % \
# (svm.get_objective(), svm.get_num_support_vectors())
svm.set_batch_computation_enabled(False)
svm.set_linadd_enabled(False)
svm.classify().get_labels()
svm.set_batch_computation_enabled(True)
svm.classify().get_labels()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
label_train_dna=lm.load_labels('../data/label_train_dna.dat')
do_batch_linadd()
# This example demonstrates how to train an SVMLight classifier
# using a custom linear term. This is used in the class DASVM that
# pre-computes this linear term using a previously trained SVM.
#
import numpy
from shogun.Features import StringCharFeatures, Labels, DNA
from shogun.Kernel import WeightedDegreeStringKernel
from shogun.Classifier import SVMLight

degree=3

fm_train_dna=['CGCACGTACGTAGCTCGAT', 'CGACGTAGTCGTAGTCGTA',
              'CGACGGGGGGGGGGTCGTA', 'CGACCTAGTCGTAGTCGTA',
              'CGACCACAGTTATATAGTA', 'CGACGTAGTCGTAGTCGTA',
              'CGACGTAGTTTTTTTCGTA', 'CGACGTAGTCGTAGCCCCA',
              'CAAAAAAAAAAAAAAAATA', 'CGACGGGGGGGGGGGCGTA']
label_train_dna=numpy.array(5*[-1.0] + 5*[1.0])

fm_test_dna=['AGCACGTACGTAGCTCGAT', 'AGACGTAGTCGTAGTCGTA',
             'CAACGGGGGGGGGGTCGTA', 'CGACCTAGTCGTAGTCGTA',
             'CGAACACAGTTATATAGTA', 'CGACCTAGTCGTAGTCGTA',
             'CGACGTGGGGTTTTTCGTA', 'CGACGTAGTCCCAGCCCCA',
             'CAAAAAAAAAAAACCAATA', 'CGACGGCCGGGGGGGCGTA']
label_test_dna=numpy.array(5*[-1.0] + 5*[1.0])

print 'SVMLight'

feats_train=StringCharFeatures(DNA)
feats_train.set_features(fm_train_dna)
feats_test=StringCharFeatures(DNA)
feats_test.set_features(fm_test_dna)

kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)

C=10
epsilon=1e-5
num_threads=1
labels=Labels(label_train_dna)

svm=SVMLight(C, kernel, labels)
svm.set_qpsize(3)
# custom linear term for the dual quadratic program
svm.set_linear_term(-numpy.array([1,2,3,4,5,6,7,8,7,6], dtype=numpy.double))
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.train()

kernel.init(feats_train, feats_test)
out = svm.classify().get_labels()
# In this example a two-class support vector machine classifier is trained on a
# DNA splice-site detection data set and the trained classifier is used to predict
# labels on the test set. As training algorithm SVM^light is used with SVM
# regularization parameter C=1.2 and the Weighted Degree kernel of degree 20 and
# the precision parameter epsilon=1e-5.
#
# For more details on the SVM^light see
# T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel
# Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA USA, 1999.
#
# For more details on the Weighted Degree kernel see
# G. Rätsch, S. Sonnenburg, and B. Schölkopf. RASE: recognition of alternatively
# spliced exons in C. elegans. Bioinformatics, 21:369-377, June 2005.
def svm_light ():
print 'SVMLight'
from shogun.Features import StringCharFeatures, Labels, DNA
from shogun.Kernel import WeightedDegreeStringKernel
try:
from shogun.Classifier import SVMLight
except ImportError:
print 'No support for SVMLight available.'
return
feats_train=StringCharFeatures(DNA)
feats_train.set_features(fm_train_dna)
feats_test=StringCharFeatures(DNA)
feats_test.set_features(fm_test_dna)
degree=20
kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)
C=1.2
epsilon=1e-5
num_threads=1
labels=Labels(label_train_dna)
svm=SVMLight(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.train()
kernel.init(feats_train, feats_test)
svm.classify().get_labels()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
label_train_dna=lm.load_labels('../data/label_train_dna.dat')
svm_light()
# In this example a two-class linear support vector machine classifier (SVM) is
# trained on a toy data set and the trained classifier is used to predict labels
# of test examples. As training algorithm the SVMLIN solver is used with the SVM
# regularization parameter C=0.9 and the bias in the classification rule switched
# on and the precision parameter epsilon=1e-5. The example also shows how to
# retrieve the parameters (vector w and bias b) of the trained linear classifier.
#
# For more details on the SVMLIN solver see
# V. Sindhwani, S.S. Keerthi. Newton Methods for Fast Solution of Semi-supervised
# Linear SVMs. Large Scale Kernel Machines, MIT Press (book chapter), 2007
def svmlin ():
print 'SVMLin'
from shogun.Features import RealFeatures, SparseRealFeatures, Labels
from shogun.Classifier import SVMLin
realfeat=RealFeatures(fm_train_real)
feats_train=SparseRealFeatures()
feats_train.obtain_from_simple(realfeat)
realfeat=RealFeatures(fm_test_real)
feats_test=SparseRealFeatures()
feats_test.obtain_from_simple(realfeat)
C=0.9
epsilon=1e-5
num_threads=1
labels=Labels(label_train_twoclass)
svm=SVMLin(C, feats_train, labels)
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.set_bias_enabled(True)
svm.train()
svm.set_features(feats_test)
svm.get_bias()
svm.get_w()
svm.classify().get_labels()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
label_train_twoclass=lm.load_labels('../data/label_train_twoclass.dat')
svmlin()
# In this example a two-class linear support vector machine classifier is trained
# on a toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the OCAS solver is used with the SVM
# regularization parameter C=0.9 and the bias term in the classification rule
# switched off and the precision parameter epsilon=1e-5 (duality gap).
#
# For more details on the OCAS solver see
# V. Franc, S. Sonnenburg. Optimized Cutting Plane Algorithm for Large-Scale Risk
# Minimization. The Journal of Machine Learning Research, vol. 10,
# pp. 2157--2192, October 2009.
#
def svmocas ():
print 'SVMOcas'
from shogun.Features import RealFeatures, SparseRealFeatures, Labels
from shogun.Classifier import SVMOcas
realfeat=RealFeatures(fm_train_real)
feats_train=SparseRealFeatures()
feats_train.obtain_from_simple(realfeat)
realfeat=RealFeatures(fm_test_real)
feats_test=SparseRealFeatures()
feats_test.obtain_from_simple(realfeat)
C=0.9
epsilon=1e-5
num_threads=1
labels=Labels(label_train_twoclass)
svm=SVMOcas(C, feats_train, labels)
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.set_bias_enabled(False)
svm.train()
svm.set_features(feats_test)
svm.classify().get_labels()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
label_train_twoclass=lm.load_labels('../data/label_train_twoclass.dat')
svmocas()
# In this example a two-class linear support vector machine classifier is trained
# on a toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the Stochastic Gradient Descent (SGD) solver is
# used with the SVM regularization parameter C=0.9. The number of iterations, i.e.
# passes through all training examples, is set to num_iter=5.
#
# For more details on the SGD solver see
# L. Bottou, O. Bousquet. The tradeoff of large scale learning. In NIPS 20. MIT
# Press. 2008.
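# A minimal numpy sketch of one SGD epoch on the regularized hinge loss (an
# illustrative aside; the exact updates and learning-rate schedule of shogun's
# SVMSGD differ in detail). X holds one training example per column:
def sgd_epoch (w, X, y, lam, eta):
    from numpy import dot
    for t in xrange(X.shape[1]):
        w=w*(1.0 - eta*lam)               # gradient step on the regularizer
        if y[t]*dot(w, X[:, t]) < 1.0:    # hinge margin violated
            w=w + eta*y[t]*X[:, t]        # gradient step on the loss
    return w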
def svmsgd ():
print 'SVMSGD'
from shogun.Features import RealFeatures, SparseRealFeatures, Labels
from shogun.Classifier import SVMSGD
realfeat=RealFeatures(fm_train_real)
feats_train=SparseRealFeatures()
feats_train.obtain_from_simple(realfeat)
realfeat=RealFeatures(fm_test_real)
feats_test=SparseRealFeatures()
feats_test.obtain_from_simple(realfeat)
C=0.9
num_threads=1
num_iter=5
labels=Labels(label_train_twoclass)
svm=SVMSGD(C, feats_train, labels)
svm.set_epochs(num_iter)
#svm.io.set_loglevel(0)
svm.train()
svm.set_features(feats_test)
svm.classify().get_labels()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
label_train_twoclass=lm.load_labels('../data/label_train_twoclass.dat')
svmsgd()
# In this example an agglomerative hierarchical single linkage clustering method
# is used to cluster a given toy data set. Starting with each object assigned to
# its own cluster, the two clusters whose closest elements are nearest to each
# other (minimum distance, here measured via the Euclidean distance object) are
# iteratively merged.
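# For reference, a hedged aside: the same single-linkage clustering can be
# computed with scipy, if it is available. Note that scipy expects one
# observation per row, while shogun's features store one example per column.
def single_linkage_scipy (X):
    from scipy.cluster.hierarchy import linkage
    return linkage(X.T, method='single', metric='euclidean')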
def hierarchical ():
print 'Hierarchical'
from shogun.Distance import EuclidianDistance
from shogun.Features import RealFeatures
from shogun.Clustering import Hierarchical
merges=3
feats_train=RealFeatures(fm_train)
distance=EuclidianDistance(feats_train, feats_train)
hierarchical=Hierarchical(merges, distance)
hierarchical.train()
hierarchical.get_merge_distances()
hierarchical.get_cluster_pairs()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train=lm.load_numbers('../data/fm_train_real.dat')
hierarchical()
# In this example the k-means clustering method is used to cluster a given toy
# data set. In k-means clustering one tries to partition n observations into k
# clusters in which each observation belongs to the cluster with the nearest mean.
# The algorithm class constructor takes the number of clusters and a distance to
# be used as input. The distance used in this example is Euclidean distance.
# After training one can fetch the result of clustering by obtaining the cluster
# centers and their radii.
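# A minimal numpy sketch of Lloyd's k-means iteration (an illustrative aside;
# the example below uses shogun's KMeans class). X holds one example per column:
def kmeans_numpy (X, k, n_iter=100):
    from numpy import array, argmin
    from numpy.random import permutation
    centers=X[:, permutation(X.shape[1])[:k]]  # k random examples as initial centers
    for it in xrange(n_iter):
        # assign every example to its nearest center
        assign=array([argmin(((centers - X[:, i:i+1])**2).sum(axis=0))
                      for i in xrange(X.shape[1])])
        # move every center to the mean of its assigned examples
        for j in xrange(k):
            if (assign == j).any():
                centers[:, j]=X[:, assign == j].mean(axis=1)
    return centers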
def kmeans ():
print 'KMeans'
from shogun.Distance import EuclidianDistance
from shogun.Features import RealFeatures
from shogun.Clustering import KMeans
k=3
feats_train=RealFeatures(fm_train)
distance=EuclidianDistance(feats_train, feats_train)
kmeans=KMeans(k, distance)
kmeans.train()
kmeans.get_cluster_centers()
kmeans.get_radiuses()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train=lm.load_numbers('../data/fm_train_real.dat')
kmeans()
# The approach applied below, which shows how to process input data loaded
# from a file, is a useful template for writing your own sample applications.
# It is just one example of what can be done with the distance functions
# provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# matrix between these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CBrayCurtisDistance.html.
#
# Obviously, using the Bray Curtis distance is not limited to this showcase
# example.
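# The Bray-Curtis dissimilarity between two vectors is, in its usual
# definition (assumed here; see the class documentation for shogun's exact
# formula), sum(|x_i - y_i|) / sum(|x_i + y_i|). A minimal numpy sketch:
def bray_curtis (x, y):
    from numpy import abs, sum
    return sum(abs(x - y)) / sum(abs(x + y))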
def bray_curtis_distance ():
print 'BrayCurtisDistance'
from shogun.Features import RealFeatures
from shogun.Distance import BrayCurtisDistance
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=BrayCurtisDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
bray_curtis_distance()
# The approach applied below, which shows how to process input data loaded
# from a file, is a useful template for writing your own sample applications.
# It is just one example of what can be done with the distance functions
# provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance (dissimilarity ratio) matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# matrix between these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CCanberraMetric.html.
#
# Obviously, using the Canberra distance is not limited to this showcase
# example.
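# The Canberra distance between two vectors is, in its usual definition
# (assumed here), sum(|x_i - y_i| / (|x_i| + |y_i|)). A minimal numpy sketch,
# assuming no coordinate is zero in both vectors:
def canberra (x, y):
    from numpy import abs, sum
    return sum(abs(x - y) / (abs(x) + abs(y)))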
def canberra_metric ():
print 'CanberraMetric'
from shogun.Features import RealFeatures
from shogun.Distance import CanberraMetric
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=CanberraMetric(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
canberra_metric()
# This example shows how to compute the Canberra Word Distance.
def canberra_word_distance ():
print 'CanberraWordDistance'
from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
from shogun.PreProc import SortWordString
from shogun.Distance import CanberraWordDistance
order=3
gap=0
reverse=False
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
distance=CanberraWordDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
canberra_word_distance()
# The approach applied below, which shows how to process input data loaded
# from a file, is a useful template for writing your own sample applications.
# It is just one example of what can be done with the distance functions
# provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance (maximum of absolute feature dimension differences) matrix is
# computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# (maximum of absolute feature dimension differences) matrix between these
# two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CChebyshewMetric.html.
#
# Obviously, using the Chebyshev distance is not limited to this showcase
# example.
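# The Chebyshev distance is the maximum absolute difference over the feature
# dimensions, max_i |x_i - y_i|. A minimal numpy sketch of this formula:
def chebyshev (x, y):
    from numpy import abs
    return abs(x - y).max()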
def chebyshew_metric ():
print 'ChebyshewMetric'
from shogun.Features import RealFeatures
from shogun.Distance import ChebyshewMetric
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=ChebyshewMetric(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
chebyshew_metric()
# The approach applied below, which shows how to process input data loaded
# from a file, is a useful template for writing your own sample applications.
# It is just one example of what can be done with the distance functions
# provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# matrix between these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CChiSquareDistance.html.
#
# Obviously, using the ChiSquare distance is not limited to this showcase
# example.
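# One common definition of the chi-square distance (assumed here; see the
# class documentation for shogun's exact formula) is
# sum((x_i - y_i)^2 / (x_i + y_i)), for non-negative inputs with x_i + y_i > 0.
# A minimal numpy sketch:
def chi_square (x, y):
    return (((x - y)**2) / (x + y)).sum()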
def chi_square_distance ():
print 'ChiSquareDistance'
from shogun.Features import RealFeatures
from shogun.Distance import ChiSquareDistance
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=ChiSquareDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
chi_square_distance()
# The approach applied below, which shows how to process input data loaded
# from a file, is a useful template for writing your own sample applications.
# It is just one example of what can be done with the distance functions
# provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# matrix between these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CCosineDistance.html.
#
# Obviously, using the Cosine distance is not limited to this showcase
# example.
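# The cosine distance is one minus the cosine of the angle between the two
# vectors, 1 - <x,y> / (||x|| * ||y||). A minimal numpy sketch:
def cosine_dist (x, y):
    from numpy import dot, sqrt
    return 1.0 - dot(x, y)/(sqrt(dot(x, x))*sqrt(dot(y, y)))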
def cosine_distance ():
print 'CosineDistance'
from shogun.Features import RealFeatures
from shogun.Distance import CosineDistance
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=CosineDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
cosine_distance()
# The approach applied below, which shows how to process input data loaded
# from a file, is a useful template for writing your own sample applications.
# It is just one example of what can be done with the distance functions
# provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# matrix between these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CEuclidianDistance.html.
#
# Obviously, using the Euclidean distance is not limited to this showcase
# example.
def euclidian_distance ():
print 'EuclidianDistance'
from shogun.Features import RealFeatures
from shogun.Distance import EuclidianDistance
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=EuclidianDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
euclidian_distance()
# The approach applied below, which shows how to process input data loaded
# from a file, is a useful template for writing your own sample applications.
# It is just one example of what can be done with the distance functions
# provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a
# pairwise distance (shortest path on a sphere) matrix is computed
# by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# (shortest path on a sphere) matrix between these two data sets is
# computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CGeodesicMetric.html.
#
# Obviously, using the Geodesic distance is not limited to this showcase
# example.
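# One way to realize a "shortest path on a sphere" distance (assumed here;
# see the class documentation for shogun's exact formula) is the arc length
# between the direction vectors, i.e. the arccos of their cosine similarity:
def geodesic (x, y):
    from numpy import dot, sqrt, arccos, clip
    c=dot(x, y)/(sqrt(dot(x, x))*sqrt(dot(y, y)))
    return arccos(clip(c, -1.0, 1.0))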
def geodesic_metric ():
print 'GeodesicMetric'
from shogun.Features import RealFeatures
from shogun.Distance import GeodesicMetric
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=GeodesicMetric(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
geodesic_metric()
# This example shows how to compute the Hamming Word Distance for string features.
def hamming_word_distance ():
print 'HammingWordDistance'
from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
from shogun.PreProc import SortWordString
from shogun.Distance import HammingWordDistance
order=3
gap=0
reverse=False
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
use_sign=False
distance=HammingWordDistance(feats_train, feats_train, use_sign)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
hamming_word_distance()
# The approach applied below, which shows how to process input data loaded
# from a file, is a useful template for writing your own sample applications.
# It is just one example of what can be done with the distance functions
# provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance (divergence measure based on the Kullback-Leibler divergence) matrix
# is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# (divergence measure based on the Kullback-Leibler divergence) matrix between
# these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CJensenMetric.html.
#
# Obviously, using the Jensen-Shannon distance/divergence is not limited to
# this showcase example.
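# A minimal sketch of the Jensen-Shannon divergence underlying this metric
# (an illustrative aside assuming strictly positive inputs; shogun's exact
# normalization may differ): JS(p,q) = KL(p||m) + KL(q||m) with m = (p+q)/2.
def jensen_shannon (p, q):
    from numpy import log
    m=0.5*(p + q)
    kl=lambda a, b: (a*log(a/b)).sum()   # Kullback-Leibler divergence
    return kl(p, m) + kl(q, m)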
def jensen_metric ():
print 'JensenMetric'
from shogun.Features import RealFeatures
from shogun.Distance import JensenMetric
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=JensenMetric(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
jensen_metric()
# This example shows how to compute the Manhattan Distance.
def manhattan_metric ():
print 'ManhattanMetric'
from shogun.Features import RealFeatures
from shogun.Distance import ManhattanMetric
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=ManhattanMetric(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
manhattan_metric()
# This example shows how to compute the Manhattan Distance for string features.
def manhattan_word_distance ():
print 'ManhattanWordDistance'
from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
from shogun.PreProc import SortWordString
from shogun.Distance import ManhattanWordDistance
order=3
gap=0
reverse=False
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
distance=ManhattanWordDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
manhattan_word_distance()
# The approach applied below, which shows how to process input data loaded
# from a file, is a useful template for writing your own sample applications.
# It is just one example of what can be done with the distance functions
# provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) and norm 'k' controls the processing of the given data points,
# where a pairwise distance matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# matrix between these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CMinkowskiMetric.html.
#
# Obviously, using the Minkowski metric is not limited to this showcase
# example.
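# The Minkowski metric of norm k is (sum_i |x_i - y_i|^k)^(1/k); k=2 recovers
# the Euclidean distance and k=1 the Manhattan distance. A minimal numpy sketch:
def minkowski (x, y, k):
    from numpy import abs
    return (abs(x - y)**k).sum()**(1.0/k)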
def minkowski_metric ():
print 'MinkowskiMetric'
from shogun.Features import RealFeatures
from shogun.Distance import MinkowskiMetric
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
k=3
distance=MinkowskiMetric(feats_train, feats_train, k)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
minkowski_metric()
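# In this example the squared Euclidean distance is computed on a toy data
# set: the EuclidianDistance object is created as before, but the final
# square root is disabled via 'set_disable_sqrt'.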
def norm_squared_distance ():
from shogun.Features import RealFeatures
from shogun.Distance import EuclidianDistance
print 'EuclidianDistance - NormSquared'
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=EuclidianDistance(feats_train, feats_train)
distance.set_disable_sqrt(True)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
norm_squared_distance()
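# This example shows how to compute the Euclidean distance on sparse
# features: the dense input matrices are first converted to
# 'SparseRealFeatures' and then handed to 'SparseEuclidianDistance'.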
def sparse_euclidian_distance ():
print 'SparseEuclidianDistance'
from shogun.Features import RealFeatures, SparseRealFeatures
from shogun.Distance import SparseEuclidianDistance
realfeat=RealFeatures(fm_train_real)
feats_train=SparseRealFeatures()
feats_train.obtain_from_simple(realfeat)
realfeat=RealFeatures(fm_test_real)
feats_test=SparseRealFeatures()
feats_test.obtain_from_simple(realfeat)
distance=SparseEuclidianDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
sparse_euclidian_distance()
# The approach applied below, which shows how to process input data loaded
# from a file, is a crucial building block for writing your own sample
# applications. It is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance is initialized with two data sets (here the same data set
# twice, as shown in the first call), which controls how the given data points
# are processed; a pairwise distance (extended Jaccard coefficient) matrix is
# computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# (extended Jaccard coefficient) matrix between these two data sets is computed
# by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CTanimotoDistance.html.
#
# Obviously, using the Tanimoto distance/coefficient is not limited to
# this showcase example.
def tanimoto_distance ():
print 'TanimotoDistance'
from shogun.Features import RealFeatures
from shogun.Distance import TanimotoDistance
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=TanimotoDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
tanimoto_distance()
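# For reference, a small numpy sketch of the extended Jaccard coefficient
# (assuming the common convention that the Tanimoto distance is one minus the
# coefficient; this is an illustration, not shogun's source):
import numpy
x=numpy.array([1.0, 0.0, 2.0])
y=numpy.array([1.0, 1.0, 1.0])
xy=numpy.dot(x, y)
t=xy/(numpy.dot(x, x)+numpy.dot(y, y)-xy)   # extended Jaccard coefficient
print 1.0-t                                 # the corresponding distance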
# In this example the Histogram algorithm object computes a histogram over all
# 16bit unsigned integers in the features.
def histogram ():
print 'Histogram'
from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
from shogun.Distribution import Histogram
order=3
gap=0
reverse=False
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_dna)
feats=StringWordFeatures(charfeat.get_alphabet())
feats.obtain_from_char(charfeat, order-1, order, gap, reverse)
histo=Histogram(feats)
histo.train()
histo.get_histogram()
num_examples=feats.get_num_vectors()
num_param=histo.get_num_model_parameters()
#for i in xrange(num_examples):
# for j in xrange(num_param):
# histo.get_log_derivative(j, i)
histo.get_log_likelihood()
histo.get_log_likelihood_sample()
###########################################################################
# call functions
###########################################################################
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_dna=lm.load_dna('../data/fm_train_dna.dat')
histogram()
# In this example a hidden Markov model with 3 states and 6 transitions is
# trained on a string data set. After calling the constructor of the HMM
# class, specifying the number of states and transitions, the model is
# trained. Via the Baum-Welch algorithm the optimal transition and emission
# probabilities are estimated. The best path, i.e. the path with the highest
# probability given the model, can then be calculated using
# get_best_path_state.
def hmm ():
print 'HMM'
from shogun.Features import StringWordFeatures, StringCharFeatures, CUBE
from shogun.Distribution import HMM, BW_NORMAL
N=3
M=6
pseudo=1e-1
order=1
gap=0
reverse=False
num_examples=2
charfeat=StringCharFeatures(CUBE)
charfeat.set_features(fm_cube)
feats=StringWordFeatures(charfeat.get_alphabet())
feats.obtain_from_char(charfeat, order-1, order, gap, reverse)
hmm=HMM(feats, N, M, pseudo)
hmm.train()
hmm.baum_welch_viterbi_train(BW_NORMAL)
num_examples=feats.get_num_vectors()
num_param=hmm.get_num_model_parameters()
for i in xrange(num_examples):
for j in xrange(num_param):
hmm.get_log_derivative(j, i)
best_path=0
best_path_state=0
for i in xrange(num_examples):
best_path+=hmm.best_path(i)
for j in xrange(N):
best_path_state+=hmm.get_best_path_state(i, j)
hmm.get_log_likelihood()
hmm.get_log_likelihood_sample()
###########################################################################
# call functions
###########################################################################
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_cube=lm.load_cubes('../data/fm_train_cube.dat')
hmm()
# Trains an inhomogeneous Markov chain of order 3 on a DNA string data set.
# Due to the structure of the Markov chain it is very similar to an HMM with
# just one chain of connected hidden states - that is why we termed it a
# linear HMM.
def linear_hmm ():
print 'LinearHMM'
from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
from shogun.Distribution import LinearHMM
order=3
gap=0
reverse=False
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_dna)
feats=StringWordFeatures(charfeat.get_alphabet())
feats.obtain_from_char(charfeat, order-1, order, gap, reverse)
hmm=LinearHMM(feats)
hmm.train()
hmm.get_transition_probs()
num_examples=feats.get_num_vectors()
num_param=hmm.get_num_model_parameters()
for i in xrange(num_examples):
for j in xrange(num_param):
hmm.get_log_derivative(j, i)
hmm.get_log_likelihood()
hmm.get_log_likelihood_sample()
###########################################################################
# call functions
###########################################################################
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_dna=lm.load_dna('../data/fm_train_dna.dat')
linear_hmm()
# This example shows how to read and write plain ASCII files, binary files and
# HDF5 datasets.
#
# For ASCII files it shows how to obtain shogun's RealFeatures
# (a simple feature matrix of doubles with 1 column == 1 example, nr_columns ==
# number of examples) and also sparse features in SVM light format.
#
# Binary files use a custom native format, and datasets can be read/written
# from/to HDF5 files with an arbitrary group/path.
def io ():
print 'Features IO'
import numpy
from shogun.Features import SparseRealFeatures, RealFeatures, Labels
from shogun.Kernel import GaussianKernel
from shogun.Library import AsciiFile, BinaryFile, HDF5File
feats=SparseRealFeatures(fm_train_real)
feats2=SparseRealFeatures()
f=BinaryFile("fm_train_sparsereal.bin","w")
feats.save(f)
f=AsciiFile("fm_train_sparsereal.ascii","w")
feats.save(f)
f=BinaryFile("fm_train_sparsereal.bin")
feats2.load(f)
f=AsciiFile("fm_train_sparsereal.ascii")
feats2.load(f)
feats=RealFeatures(fm_train_real)
feats2=RealFeatures()
f=BinaryFile("fm_train_real.bin","w")
feats.save(f)
f=HDF5File("fm_train_real.h5","w", "/data/doubles")
feats.save(f)
f=AsciiFile("fm_train_real.ascii","w")
feats.save(f)
f=BinaryFile("fm_train_real.bin")
feats2.load(f)
print "diff binary", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten()))
f=AsciiFile("fm_train_real.ascii")
feats2.load(f)
print "diff ascii", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten()))
lab=Labels(numpy.array([1.0,2.0,3.0]))
lab2=Labels()
f=AsciiFile("label_train_twoclass.ascii","w")
lab.save(f)
f=BinaryFile("label_train_twoclass.bin","w")
lab.save(f)
f=HDF5File("fm_train_real.h5","a", "/data/labels")
lab.save(f)
f=AsciiFile("label_train_twoclass.ascii")
lab2.load(f)
f=BinaryFile("label_train_twoclass.bin")
lab2.load(f)
f=HDF5File("fm_train_real.h5","r", "/data/doubles")
feats2.load(f)
print feats2.get_feature_matrix()
f=HDF5File("fm_train_real.h5","r", "/data/labels")
lab2.load(f)
print lab2.get_labels()
#clean up
import os
for f in ['fm_train_sparsereal.bin','fm_train_sparsereal.ascii',
'fm_train_real.bin','fm_train_real.h5','fm_train_real.ascii',
'label_train_twoclass.ascii','label_train_twoclass.bin']:
os.unlink(f)
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
label_train_twoclass=lm.load_numbers('../data/label_train_twoclass.dat')
io()
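# The HDF5 file written above can also be inspected with external tools, e.g.
# h5py (a sketch under the assumption that h5py is installed; it has to run
# before the cleanup at the end of io(), since the file is deleted there):
import h5py
g=h5py.File('fm_train_real.h5', 'r')
print g['/data/doubles'][:]
g.close()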
# This example demonstrates how to read and write data in the SVMLight format
# with shogun.
#
import os
from shogun.Features import SparseRealFeatures
f=SparseRealFeatures()
lab=f.load_svmlight_file('../data/train_sparsereal.light')
f.write_svmlight_file('testwrite.light', lab)
os.unlink('testwrite.light')
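# For reference, each line of an SVM light file stores one label followed by
# the non-zero entries of a sparse example as <index>:<value> pairs, e.g. a
# positive example with features 1 and 7 set (an illustrative line, not taken
# from the data file):
#
# +1 1:0.5 7:1.2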
# This example demonstrates how to encode small positive natural numbers
# (up to 255) in shogun using ByteFeatures.
from shogun.Features import ByteFeatures
from numpy import array, uint8, all
# create dense matrix A
A=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=uint8)
# ... of type Byte
a=ByteFeatures(A)
# print some statistics about a
print a.get_num_vectors()
print a.get_num_features()
# get first feature vector and set it
print a.get_feature_vector(0)
a.set_feature_vector(array([1,4,0,0,0,9], dtype=uint8), 0)
# get matrix
a_out = a.get_feature_matrix()
print type(a_out), a_out.dtype
print a_out
assert(all(a_out==A))
# This example demonstrates how to encode features composed of 64bit integers
# in shogun using LongIntFeatures.
from shogun.Features import LongIntFeatures
from numpy import array, int64, all
# create dense matrix A
A=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=int64)
# ... of type LongInt
a=LongIntFeatures(A)
# print some statistics about a
print a.get_num_vectors()
print a.get_num_features()
# get first feature vector and set it
print a.get_feature_vector(0)
a.set_feature_vector(array([1,4,0,0,0,9], dtype=int64), 0)
# get matrix
a_out = a.get_feature_matrix()
print type(a_out), a_out.dtype
print a_out
assert(all(a_out==A))
# This example shows how to encode features that live in various vector spaces
# using the appropriate shogun objects. We demonstrate how to use
# three types of features: ByteFeatures (small integer values),
# LongIntFeatures (large integer values) and finally RealFeatures
# (real-valued vectors).
from shogun.Features import RealFeatures, LongIntFeatures, ByteFeatures
from numpy import array, float64, int64, uint8, all
# create dense matrices A,B,C
A=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=float64)
B=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=int64)
C=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=uint8)
# ... of type Real, LongInt and Byte
a=RealFeatures(A)
b=LongIntFeatures(B)
c=ByteFeatures(C)
# or 16bit wide ...
#feat1 = f.ShortFeatures(N.zeros((10,5),N.short))
#feat2 = f.WordFeatures(N.zeros((10,5),N.uint16))
# print some statistics about a
print a.get_num_vectors()
print a.get_num_features()
# get first feature vector and set it
print a.get_feature_vector(0)
a.set_feature_vector(array([1,4,0,0,0,9], dtype=float64), 0)
# get matrices
a_out = a.get_feature_matrix()
b_out = b.get_feature_matrix()
c_out = c.get_feature_matrix()
print type(a_out), a_out.dtype
print a_out
assert(all(a_out==A))
print type(b_out), b_out.dtype
print b_out
assert(all(b_out==B))
print type(c_out), c_out.dtype
print c_out
assert(all(c_out==C))
# This example demonstrates how to encode real-valued features in shogun,
# using RealFeatures.
from shogun.Features import RealFeatures
from numpy import array, float64, all
# create dense matrix A
A=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=float64)
# ... of type Real
a=RealFeatures(A)
# print some statistics about a
print a.get_num_vectors()
print a.get_num_features()
# get first feature vector and set it
print a.get_feature_vector(0)
a.set_feature_vector(array([1,4,0,0,0,9], dtype=float64), 0)
# get matrix
a_out = a.get_feature_matrix()
print type(a_out), a_out.dtype
print a_out
assert(all(a_out==A))
# Creates features similar to the feature space of the SNP kernel. Useful when
# working with linear methods.
from shogun.Features import *
from numpy import *
sf=StringByteFeatures(DIGIT2)
sf.load_ascii_file('x', False, DIGIT2, DIGIT2)
print sf.get_features()
snps=SNPFeatures(sf)
print snps.get_feature_matrix()
print snps.get_minor_base_string()
print snps.get_major_base_string()
# This example demonstrates how to encode sparse (most entries zero),
# real-valued features in shogun using SparseRealFeatures.
from scipy.sparse import csc_matrix
from shogun.Features import SparseRealFeatures
from numpy import array, float64, all
# create dense matrix A and its sparse representation X
# note, will work with types other than float64 too,
# but requires recent scipy.sparse
A=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=float64)
X=csc_matrix(A)
print A
# create sparse shogun features from dense matrix A
a=SparseRealFeatures(A)
a_out=a.get_full_feature_matrix()
print a_out
assert(all(a_out==A))
# create sparse shogun features from sparse matrix X
a.set_sparse_feature_matrix(X)
a_out=a.get_full_feature_matrix()
print a_out
assert(all(a_out==A))
# create sparse shogun features from sparse matrix X
a=SparseRealFeatures(X)
a_out=a.get_full_feature_matrix()
print a_out
assert(all(a_out==A))
# obtain (data,row,indptr) csc arrays of sparse shogun features
z=csc_matrix(a.get_sparse_feature_matrix())
z_out=z.todense()
print z_out
assert(all(z_out==A))
# This example demonstrates how to use compressed strings with shogun.
# We currently support reading and writing compressed files using
# LZO, GZIP, BZIP2 and LZMA. Furthermore, we demonstrate how to extract
# compressed streams on-the-fly in order to fit data sets into memory that
# would otherwise be too large.
#
from shogun.Features import StringCharFeatures, StringFileCharFeatures, RAWBYTE
from shogun.Library import UNCOMPRESSED,LZO,GZIP,BZIP2,LZMA, MSG_DEBUG
from shogun.PreProc import DecompressCharString
f=StringFileCharFeatures('features_string_char_compressed_modular.py', RAWBYTE)
print "original strings", f.get_features()
#uncompressed
f.save_compressed("foo_uncompressed.str", UNCOMPRESSED, 1)
f2=StringCharFeatures(RAWBYTE);
f2.load_compressed("foo_uncompressed.str", True)
print "uncompressed strings", f2.get_features()
print
# load compressed data and uncompress on load
#lzo
f.save_compressed("foo_lzo.str", LZO, 9)
f2=StringCharFeatures(RAWBYTE);
f2.load_compressed("foo_lzo.str", True)
print "lzo strings", f2.get_features()
print
##gzip
f.save_compressed("foo_gzip.str", GZIP, 9)
f2=StringCharFeatures(RAWBYTE);
f2.load_compressed("foo_gzip.str", True)
print "gzip strings", f2.get_features()
print
#bzip2
f.save_compressed("foo_bzip2.str", BZIP2, 9)
f2=StringCharFeatures(RAWBYTE);
f2.load_compressed("foo_bzip2.str", True)
print "bzip2 strings", f2.get_features()
print
#lzma
f.save_compressed("foo_lzma.str", LZMA, 9)
f2=StringCharFeatures(RAWBYTE);
f2.load_compressed("foo_lzma.str", True)
print "lzma strings", f2.get_features()
print
# load compressed data and uncompress via preprocessor
f2=StringCharFeatures(RAWBYTE);
f2.load_compressed("foo_lzo.str", False)
f2.add_preproc(DecompressCharString(LZO))
f2.apply_preproc()
print "lzo strings", f2.get_features()
print
# load compressed data and uncompress on-the-fly via preprocessor
f2=StringCharFeatures(RAWBYTE);
f2.load_compressed("foo_lzo.str", False)
f2.io.set_loglevel(MSG_DEBUG)
f2.add_preproc(DecompressCharString(LZO))
f2.enable_on_the_fly_preprocessing()
print "lzo strings", f2.get_features()
print
#clean up
import os
for f in ['foo_uncompressed.str', 'foo_lzo.str', 'foo_gzip.str',
'foo_bzip2.str', 'foo_lzma.str']:
if os.path.exists(f):
os.unlink(f)
##########################################################################################
# some perfectly compressible stuff follows
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
# This example demonstrates how to encode ASCII-strings (255 symbols) in shogun.
from shogun.Features import StringCharFeatures, RAWBYTE
from numpy import array
#create string features
f=StringCharFeatures(['hey','guys','i','am','a','string'], RAWBYTE)
#and output several stats
print "max string length", f.get_max_vector_length()
print "number of strings", f.get_num_vectors()
print "length of first string", f.get_vector_length(0)
print "string[5]", ''.join(f.get_feature_vector(5))
print "strings", f.get_features()
#replace string 0
f.set_feature_vector(array(['t','e','s','t']), 0)
print "strings", f.get_features()
# This example demonstrates how to load ASCII features from a file into shogun.
from shogun.Features import StringFileCharFeatures, RAWBYTE
f = StringFileCharFeatures('features_string_file_char_modular.py', RAWBYTE)
print "strings", f.get_features()
# This example demonstrates how to load string features from files.
# We cover two cases: first, we show how to obtain StringCharFeatures
# from a directory of text files (particularly useful in computational biology)
# and second, we demonstrate how to load StringCharFeatures from one (multi-line) file.
#
from shogun.Features import StringCharFeatures, RAWBYTE
# load features from directory
f=StringCharFeatures(RAWBYTE)
f.load_from_directory(".")
#and output several stats
print "max string length", f.get_max_vector_length()
print "number of strings", f.get_num_vectors()
print "length of first string", f.get_vector_length(0)
print "str[0,0:3]", f.get_feature(0,0), f.get_feature(0,1), f.get_feature(0,2)
print "len(str[0])", f.get_vector_length(0)
print "str[0]", f.get_feature_vector(0)
#or load features from file (one string per line)
f.load('features_string_char_modular.py')
print f.get_features()
#or load fasta file
#f.load_fasta('fasta.fa')
#print f.get_features()
# This creates a HashedWDFeatures object, i.e. an approximation to the Weighted
# Degree kernel feature space via hashes. These features can be particularly
# fast in linear SVM solvers.
from numpy import *
from shogun.Features import *
from shogun.Library import MSG_DEBUG
order=3
start_order=1
from_order=order
hash_bits=2
x=[array([0,1,2,3,0,1,2,3,3,2,2,1,1],dtype=uint8)]
print len(x[0])
f=StringByteFeatures(RAWDNA)
f.io.set_loglevel(MSG_DEBUG)
f.set_features(x)
y=HashedWDFeatures(f,start_order,order,from_order,hash_bits)
print y.get_dim_feature_space()
fm=y.get_feature_matrix()
print fm.shape
print fm
# In this example, we demonstrate how to obtain string features
# by using a sliding window in a memory-efficient way. Instead of copying
# the string for each position of the sliding window, we only store a reference
# with respect to the complete string. This is particularly useful when working
# with genomic data, where storing all explicitly copied strings in memory
# quickly becomes infeasible. In addition to a sliding window (of a particular
# length) over all positions, we also support defining a custom position
# list.
from shogun.Features import StringCharFeatures, DNA
from shogun.Library import DynamicIntArray
# create string features with a single string
s=10*'A' + 10*'C' + 10*'G' + 10*'T'
f=StringCharFeatures([s], DNA)
# slide a window of length 5 over features
# (memory efficient, does not copy strings)
f.obtain_by_sliding_window(5,1)
print f.get_num_vectors()
print f.get_vector_length(0)
print f.get_vector_length(1)
print f.get_features()
# slide a window of length 4 over features
# (memory efficient, does not copy strings)
f.obtain_by_sliding_window(4,1)
print f.get_num_vectors()
print f.get_vector_length(0)
print f.get_vector_length(1)
print f.get_features()
# extract string windows at positions 0,6,16,25 of window size 4
# (memory efficient, does not copy strings)
f.set_features([s])
positions=DynamicIntArray()
positions.append_element(0)
positions.append_element(6)
positions.append_element(16)
positions.append_element(25)
f.obtain_by_position_list(4,positions)
print f.get_features()
# now extract windows of size 8 from the same position list
f.obtain_by_position_list(8,positions)
print f.get_features()
# This example demonstrates how to encode string
# features efficiently by creating a more compactly encoded
# bit-string from StringCharFeatures.
# For instance, when working with the DNA alphabet {A,T,G,C}
# using 1 char = 1 byte per symbol would be wasteful, as we
# can encode 4 symbols using 2 bits only.
# Here, this is done in chunks of 64bit (ulong).
from shogun.Features import StringCharFeatures, StringUlongFeatures, RAWBYTE
from numpy import array, uint64
#create string features
cf=StringCharFeatures(['hey','guys','string'], RAWBYTE)
uf=StringUlongFeatures(RAWBYTE)
#start=0, order=2, gap=0, rev=False)
uf.obtain_from_char(cf, 0, 2, 0, False)
#and output several stats
print "max string length", uf.get_max_vector_length()
print "number of strings", uf.get_num_vectors()
print "length of first string", uf.get_vector_length(0)
print "string[2]", uf.get_feature_vector(2)
print "strings", uf.get_features()
#replace string 0
uf.set_feature_vector(array([1,2,3,4,5], dtype=uint64), 0)
print "strings", uf.get_features()
# This example demonstrates how to encode string
# features efficiently by creating a more compactly encoded
# bit-string from StringCharFeatures.
# For instance, when working with the DNA alphabet {A,T,G,C}
# using 1 char = 1 byte per symbol would be wasteful, as we
# can encode 4 symbols using 2 bits only.
# Here, this is done in chunks of 16bit (word).
from shogun.Features import StringCharFeatures, StringWordFeatures, RAWBYTE
from numpy import array, uint16
#create string features
cf=StringCharFeatures(['hey','guys','string'], RAWBYTE)
wf=StringWordFeatures(RAWBYTE)
#start=0, order=2, gap=0, rev=False)
wf.obtain_from_char(cf, 0, 2, 0, False)
#and output several stats
print "max string length", wf.get_max_vector_length()
print "number of strings", wf.get_num_vectors()
print "length of first string", wf.get_vector_length(0)
print "string[2]", wf.get_feature_vector(2)
print "strings", wf.get_features()
#replace string 0
wf.set_feature_vector(array([1,2,3,4,5], dtype=uint16), 0)
print "strings", wf.get_features()
# This example demonstrates the use of the AUC Kernel.
###########################################################################
# kernel can be used to maximize AUC instead of margin in SVMs
###########################################################################
def auc ():
print 'AUC'
from shogun.Kernel import GaussianKernel, AUCKernel
from shogun.Features import RealFeatures, Labels
feats_train=RealFeatures(fm_train_real)
width=1.7
subkernel=GaussianKernel(feats_train, feats_train, width)
kernel=AUCKernel(0, subkernel)
kernel.setup_auc_maximization( Labels(label_train_real) )
km_train=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
from numpy import double
lm=LoadMatrix()
fm_train_real=double(lm.load_numbers('../data/fm_train_real.dat'))
label_train_real=lm.load_labels('../data/label_train_twoclass.dat')
auc()
# This is an example for the initialization of the chi2-kernel on real data, where
# each column of the matrices corresponds to one training/test example.
###########################################################################
# chi2 kernel
###########################################################################
def chi2 ():
print 'Chi2'
from shogun.Kernel import Chi2Kernel
from shogun.Features import RealFeatures
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
width=1.4
size_cache=10
kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
from numpy import double
lm=LoadMatrix()
fm_train_real=double(lm.load_numbers('../data/fm_train_real.dat'))
fm_test_real=double(lm.load_numbers('../data/fm_test_real.dat'))
chi2()
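# A minimal sketch of a single chi2 kernel entry (assuming the common
# convention k(x,y)=exp(-sum_l (x_l-y_l)^2/(x_l+y_l) / width); if shogun
# scales differently this needs adjusting):
import numpy
x=numpy.array([1.0, 2.0])
y=numpy.array([2.0, 1.0])
width=1.4
print numpy.exp(-numpy.sum((x-y)**2/(x+y))/width)
# The next example shows how to mix a precomputed custom kernel with an
# ordinary kernel inside a CombinedKernel: a polynomial kernel matrix is
# precomputed on the training features and wrapped into a CustomKernel, a
# second polynomial kernel is appended, and the combination is used to train a
# LibSVM classifier and to classify the test examples.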
def combined_custom():
from shogun.Features import CombinedFeatures, RealFeatures, Labels
from shogun.Kernel import CombinedKernel, PolyKernel, CustomKernel
from shogun.Classifier import LibSVM
kernel = CombinedKernel()
feats_train = CombinedFeatures()
tfeats = RealFeatures(fm_train_real)
tkernel = PolyKernel(10,3)
tkernel.init(tfeats, tfeats)
K = tkernel.get_kernel_matrix()
kernel.append_kernel(CustomKernel(K))
subkfeats_train = RealFeatures(fm_train_real)
feats_train.append_feature_obj(subkfeats_train)
subkernel = PolyKernel(10,2)
kernel.append_kernel(subkernel)
kernel.init(feats_train, feats_train)
labels = Labels(fm_label_twoclass)
svm = LibSVM(1.0, kernel, labels)
svm.train()
kernel = CombinedKernel()
feats_pred = CombinedFeatures()
pfeats = RealFeatures(fm_test_real)
tkernel = PolyKernel(10,3)
tkernel.init(tfeats, pfeats)
K = tkernel.get_kernel_matrix()
kernel.append_kernel(CustomKernel(K))
subkfeats_test = RealFeatures(fm_test_real)
feats_pred.append_feature_obj(subkfeats_test)
subkernel = PolyKernel(10, 2)
kernel.append_kernel(subkernel)
kernel.init(feats_train, feats_pred)
svm.set_kernel(kernel)
svm.classify()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
fm_label_twoclass = lm.load_labels('../data/label_train_twoclass.dat')
combined_custom()
# This is an example for the initialization of a combined kernel, which is a
# weighted sum of (in this case) three kernels, one on real valued data and
# two on DNA string data. The sub-kernel weights are all set to 1.
#
def combined():
print 'Combined'
from shogun.Kernel import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel
from shogun.Features import RealFeatures, StringCharFeatures, CombinedFeatures, DNA
kernel=CombinedKernel()
feats_train=CombinedFeatures()
feats_test=CombinedFeatures()
subkfeats_train=RealFeatures(fm_train_real)
subkfeats_test=RealFeatures(fm_test_real)
subkernel=GaussianKernel(10, 1.1)
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
subkfeats_train=StringCharFeatures(fm_train_dna, DNA)
subkfeats_test=StringCharFeatures(fm_test_dna, DNA)
degree=3
subkernel=FixedDegreeStringKernel(10, degree)
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
subkfeats_train=StringCharFeatures(fm_train_dna, DNA)
subkfeats_test=StringCharFeatures(fm_test_dna, DNA)
subkernel=LocalAlignmentStringKernel(10)
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
from numpy import double
lm=LoadMatrix()
fm_train_real=double(lm.load_numbers('../data/fm_train_real.dat'))
fm_test_real=double(lm.load_numbers('../data/fm_test_real.dat'))
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
combined()
# This is an example for the initialization of the CommUlongString kernel.
# This kernel sums over k-mer matches (k='order'). For efficient computation a
# preprocessor is used that extracts and sorts all k-mers. If 'use_sign' is
# set to True, each k-mer is counted only once.
def comm_ulong_string ():
print 'CommUlongString'
from shogun.Kernel import CommUlongStringKernel
from shogun.Features import StringUlongFeatures, StringCharFeatures, DNA
from shogun.PreProc import SortUlongString
order=3
gap=0
reverse=False
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringUlongFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortUlongString()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringUlongFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
use_sign=False
kernel=CommUlongStringKernel(feats_train, feats_train, use_sign)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
comm_ulong_string()
# This is an example for the initialization of the CommWordString kernel (aka
# Spectrum or n-gram kernel; its name is derived from the unix command comm).
# This kernel sums over k-mer matches (k='order'). For efficient computation a
# preprocessor is used that extracts and sorts all k-mers. If 'use_sign' is
# set to True, each k-mer is counted only once.
def comm_word_string ():
print 'CommWordString'
from shogun.Kernel import CommWordStringKernel
from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
from shogun.PreProc import SortWordString
order=3
gap=0
reverse=False
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
use_sign=False
kernel=CommWordStringKernel(feats_train, feats_train, use_sign)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
comm_word_string()
# The constant kernel gives a trivial kernel matrix with all entries set to
# the same value defined by the argument 'c'.
#
def const ():
print 'Const'
from shogun.Features import DummyFeatures
from shogun.Kernel import ConstKernel
feats_train=DummyFeatures(10)
feats_test=DummyFeatures(17)
c=23.
kernel=ConstKernel(feats_train, feats_train, c)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
const()
# A user defined custom kernel is assigned in this example, for which only the
# lower triangle may be given (set_triangle_kernel_matrix_from_triangle) or
# a full matrix (set_full_kernel_matrix_from_full), or a full matrix which is
# then internally stored as a triangle (set_triangle_kernel_matrix_from_full).
# Labels for the examples are given, an SVM is trained and the SVM is used to
# classify the examples.
#
def custom ():
print 'Custom'
from numpy.random import rand
from numpy import array, float32
from shogun.Features import RealFeatures
from shogun.Kernel import CustomKernel
dim=7
data=rand(dim, dim)
feats=RealFeatures(data)
symdata=data+data.T
lowertriangle=array([symdata[(x,y)] for x in xrange(symdata.shape[1]) for y in xrange(symdata.shape[0]) if y<=x])
kernel=CustomKernel()
# once with float64's
kernel.set_triangle_kernel_matrix_from_triangle(lowertriangle)
km_triangletriangle=kernel.get_kernel_matrix()
kernel.set_triangle_kernel_matrix_from_full(symdata)
km_fulltriangle=kernel.get_kernel_matrix()
kernel.set_full_kernel_matrix_from_full(data)
km_fullfull=kernel.get_kernel_matrix()
# now once with float32's
data=array(data,dtype=float32)
kernel.set_triangle_kernel_matrix_from_triangle(lowertriangle)
km_triangletriangle=kernel.get_kernel_matrix()
kernel.set_triangle_kernel_matrix_from_full(symdata)
km_fulltriangle=kernel.get_kernel_matrix()
kernel.set_full_kernel_matrix_from_full(data)
km_fullfull=kernel.get_kernel_matrix()
if __name__=='__main__':
from numpy.random import seed
seed(42)
custom()
# This is an example for the initialization of the diag-kernel.
# The diag kernel has all kernel matrix entries but those on
# the main diagonal set to zero.
def diag ():
print 'Diag'
from shogun.Features import DummyFeatures
from shogun.Kernel import DiagKernel
feats_train=DummyFeatures(10)
feats_test=DummyFeatures(17)
diag=23.
kernel=DiagKernel(feats_train, feats_train, diag)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
diag()
# With the distance kernel one can use any of the following distance metrics:
# BrayCurtisDistance()
# CanberraMetric()
# CanberraWordDistance()
# ChebyshewMetric()
# ChiSquareDistance()
# CosineDistance()
# Distance()
# EuclidianDistance()
# GeodesicMetric()
# HammingWordDistance()
# JensenMetric()
# ManhattanMetric()
# ManhattanWordDistance()
# MinkowskiMetric()
# RealDistance()
# SimpleDistance()
# SparseDistance()
# SparseEuclidianDistance()
# StringDistance()
# TanimotoDistance()
#
def distance ():
print 'Distance'
from shogun.Kernel import DistanceKernel
from shogun.Features import RealFeatures
from shogun.Distance import EuclidianDistance
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
width=1.7
distance=EuclidianDistance()
kernel=DistanceKernel(feats_train, feats_train, width, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
from numpy import double
lm=LoadMatrix()
fm_train_real=double(lm.load_numbers('../data/fm_train_real.dat'))
fm_test_real=double(lm.load_numbers('../data/fm_test_real.dat'))
distance()
# The class FKFeatures implements Fisher kernel features obtained from
# two Hidden Markov models.
#
# It was used in
#
# K. Tsuda, M. Kawanabe, G. Raetsch, S. Sonnenburg, and K.R. Mueller. A new
# discriminative kernel from probabilistic models. Neural Computation,
# 14:2397-2414, 2002.
#
# which also has the details.
#
# Note that FK-features are computed on the fly, so to be effective feature
# caching should be enabled.
#
# It inherits its functionality from CSimpleFeatures, which should be
# consulted for further reference.
#
def fisher ():
print "Fisher Kernel"
from shogun.Features import StringCharFeatures, StringWordFeatures, FKFeatures, DNA
from shogun.Kernel import PolyKernel
from shogun.Distribution import HMM, BW_NORMAL
N=1 # toy HMM with 1 state
M=4 # 4 observations -> DNA
pseudo=1e-1
order=1
gap=0
reverse=False
kargs=[1, False, True]
# train HMM for positive class
charfeat=StringCharFeatures(fm_hmm_pos, DNA)
hmm_pos_train=StringWordFeatures(charfeat.get_alphabet())
hmm_pos_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
pos=HMM(hmm_pos_train, N, M, pseudo)
pos.baum_welch_viterbi_train(BW_NORMAL)
# train HMM for negative class
charfeat=StringCharFeatures(fm_hmm_neg, DNA)
hmm_neg_train=StringWordFeatures(charfeat.get_alphabet())
hmm_neg_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
neg=HMM(hmm_neg_train, N, M, pseudo)
neg.baum_welch_viterbi_train(BW_NORMAL)
# Kernel training data
charfeat=StringCharFeatures(fm_train_dna, DNA)
wordfeats_train=StringWordFeatures(charfeat.get_alphabet())
wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
# Kernel testing data
charfeat=StringCharFeatures(fm_test_dna, DNA)
wordfeats_test=StringWordFeatures(charfeat.get_alphabet())
wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
# get kernel on training data
pos.set_observations(wordfeats_train)
neg.set_observations(wordfeats_train)
feats_train=FKFeatures(10, pos, neg)
feats_train.set_opt_a(-1) #estimate prior
kernel=PolyKernel(feats_train, feats_train, *kargs)
km_train=kernel.get_kernel_matrix()
# get kernel on testing data
pos_clone=HMM(pos)
neg_clone=HMM(neg)
pos_clone.set_observations(wordfeats_test)
neg_clone.set_observations(wordfeats_test)
feats_test=FKFeatures(10, pos_clone, neg_clone)
feats_test.set_a(feats_train.get_a()) #use prior from training data
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
from numpy import where
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
label_train_dna=lm.load_labels('../data/label_train_dna.dat')
fm_hmm_pos=[ fm_train_dna[i] for i in where([label_train_dna==1])[1] ]
fm_hmm_neg=[ fm_train_dna[i] for i in where([label_train_dna==-1])[1] ]
fisher()
# The FixedDegreeString kernel takes as input two strings of the same size and
# counts the number of matches of length d ('degree').
def fixed_degree_string ():
print 'FixedDegreeString'
from shogun.Features import StringCharFeatures, DNA
from shogun.Kernel import FixedDegreeStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
degree=3
kernel=FixedDegreeStringKernel(feats_train, feats_train, degree)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
fixed_degree_string()
# The well known Gaussian kernel (the Swiss army knife for SVMs) on dense real valued features.
def gaussian ():
print 'Gaussian'
from shogun.Features import RealFeatures
from shogun.Kernel import GaussianKernel
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
width=1.9
kernel=GaussianKernel(feats_train, feats_train, width)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
gaussian()
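# A minimal cross-check of a single kernel entry (assuming the convention
# k(x,y)=exp(-||x-y||^2/width); if the library uses a different scaling of
# 'width' this needs adjusting):
import numpy
x=numpy.array([1.0, 2.0])
y=numpy.array([1.5, 1.0])
width=1.9
print numpy.exp(-numpy.sum((x-y)**2)/width)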
# An experimental kernel inspired by the WeightedDegreePositionStringKernel
# and the Gaussian kernel. The idea is to shift the dimensions of the input
# vectors against each other. 'shift_step' is the step size of the shifts and
# 'max_shift' is the maximal shift.
def gaussian_shift ():
print 'GaussianShift'
from shogun.Features import RealFeatures
from shogun.Kernel import GaussianShiftKernel
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
width=1.8
max_shift=2
shift_step=1
kernel=GaussianShiftKernel(
feats_train, feats_train, width, max_shift, shift_step)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
gaussian_shift()
# The HistogramWordString kernel computes the TOP kernel on inhomogeneous Markov chains.
def plugin_estimate_histogram ():
print 'PluginEstimate w/ HistogramWord'
from shogun.Features import StringCharFeatures, StringWordFeatures, DNA, Labels
from shogun.Kernel import HistogramWordStringKernel
from shogun.Classifier import PluginEstimate
order=3
gap=0
reverse=False
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
pie=PluginEstimate()
labels=Labels(label_train_dna)
pie.set_labels(labels)
pie.set_features(feats_train)
pie.train()
kernel=HistogramWordStringKernel(feats_train, feats_train, pie)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
pie.set_features(feats_test)
pie.classify().get_labels()
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
label_train_dna=lm.load_labels('../data/label_train_dna.dat')
plugin_estimate_histogram()
# This example shows how to save a computed kernel matrix to an ASCII file.
def gaussian ():
print 'Gaussian'
from shogun.Features import RealFeatures
from shogun.Kernel import GaussianKernel
from shogun.Library import AsciiFile, BinaryFile
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
width=1.9
kernel=GaussianKernel(feats_train, feats_train, width)
km_train=kernel.get_kernel_matrix()
f=AsciiFile("gaussian_train.ascii","w")
kernel.save(f)
del f
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
f=AsciiFile("gaussian_test.ascii","w")
kernel.save(f)
del f
#clean up
import os
os.unlink("gaussian_test.ascii")
os.unlink("gaussian_train.ascii")
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
gaussian()
# This is an example for the initialization of a linear kernel on raw byte
# data.
###########################################################################
# linear kernel on byte features
###########################################################################
def linear_byte():
print 'LinearByte'
from shogun.Kernel import LinearByteKernel
from shogun.Features import ByteFeatures
feats_train=ByteFeatures(fm_train_byte)
feats_test=ByteFeatures(fm_test_byte)
kernel=LinearByteKernel(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
from numpy import ubyte
lm=LoadMatrix()
fm_train_byte=ubyte(lm.load_numbers('../data/fm_train_byte.dat'))
fm_test_byte=ubyte(lm.load_numbers('../data/fm_test_byte.dat'))
linear_byte()
# This is an example for the initialization of a linear kernel on real valued
# data using scaling factor 1.2.
def linear ():
print 'Linear'
from shogun.Features import RealFeatures
from shogun.Kernel import LinearKernel, AvgDiagKernelNormalizer
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
scale=1.2
kernel=LinearKernel()
kernel.set_normalizer(AvgDiagKernelNormalizer(scale))
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
linear()
# This is an example for the initialization of a linear kernel on string data. The
# strings are all of the same length and consist of the characters 'ACGT' corresponding
# to the DNA-alphabet. Each column of the matrices of type char corresponds to
# one training/test example.
def linear_string ():
print 'LinearString'
from shogun.Features import StringCharFeatures, DNA
from shogun.Kernel import LinearStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=LinearStringKernel(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
linear_string()
# This is an example for the initialization of a linear kernel on word (2byte)
# data.
def linear_word ():
print 'LinearWord'
from shogun.Kernel import LinearWordKernel, AvgDiagKernelNormalizer
from shogun.Features import WordFeatures
feats_train=WordFeatures(fm_train_word)
feats_test=WordFeatures(fm_test_word)
scale=1.4
kernel=LinearWordKernel()
kernel.set_normalizer(AvgDiagKernelNormalizer(scale))
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
from numpy import ushort
lm=LoadMatrix()
fm_train_word=ushort(lm.load_numbers('../data/fm_test_word.dat'))
fm_test_word=ushort(lm.load_numbers('../data/fm_test_word.dat'))
linear_word()
# This is an example for the initialization of the local alignment kernel on
# DNA sequences, where each column of the matrices of type char corresponds to
# one training/test example.
def local_alignment_string():
print 'LocalAlignmentString'
from shogun.Features import StringCharFeatures, DNA
from shogun.Kernel import LocalAlignmentStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=LocalAlignmentStringKernel(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
local_alignment_string()
# The LocalityImprovedString kernel is inspired by the polynomial kernel.
# By comparing neighboring characters it puts emphasis on local features.
#
# It can be defined as
# K({\bf x},{\bf x'})=\left(\sum_{i=0}^{T-1}\left(\sum_{j=-l}^{+l}w_jI_{i+j}({\bf x},{\bf x'})\right)^{d_1}\right)^{d_2},
# where
# I_i({\bf x},{\bf x'})=1 if x_i=x'_i and 0 otherwise.
#
def locality_improved_string ():
print 'LocalityImprovedString'
from shogun.Features import StringCharFeatures, DNA
from shogun.Kernel import LocalityImprovedStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
length=5
inner_degree=5
outer_degree=7
kernel=LocalityImprovedStringKernel(
feats_train, feats_train, length, inner_degree, outer_degree)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
locality_improved_string()
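# The next example initializes the MatchWordString kernel (with cache size
# 'size_cache' and parameter 'degree') on word features obtained from DNA
# strings; an AvgDiagKernelNormalizer with scaling factor 'scale' is attached
# before the kernel matrices are computed.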
def match_word_string ():
print 'MatchWordString'
from shogun.Kernel import MatchWordStringKernel, AvgDiagKernelNormalizer
from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
degree=3
scale=1.4
size_cache=10
order=3
gap=0
reverse=False
charfeat=StringCharFeatures(fm_train_dna, DNA)
feats_train=StringWordFeatures(DNA)
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
charfeat=StringCharFeatures(fm_test_dna, DNA)
feats_test=StringWordFeatures(DNA)
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
kernel=MatchWordStringKernel(size_cache, degree)
kernel.set_normalizer(AvgDiagKernelNormalizer(scale))
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
match_word_string()
# This is an example initializing the oligo string kernel, which takes
# distances between matching oligos (k-mers) into account via a Gaussian.
# Variable 'k' defines the length of the oligo and variable 'width' the width
# of the Gaussian. The oligo string kernel is implemented for the DNA-alphabet
# 'ACGT'.
#
def oligo_string ():
print 'OligoString'
from shogun.Features import StringCharFeatures, DNA
from shogun.Kernel import OligoStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
k=3
width=1.2
size_cache=10
kernel=OligoStringKernel(size_cache, k, width)
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
oligo_string()
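# The next example initializes the PolyMatchString kernel on DNA string data
# with parameters 'degree' and 'inhomogene'; each column of the matrices of
# type char corresponds to one training/test example.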
def poly_match_string ():
print 'PolyMatchString'
from shogun.Kernel import PolyMatchStringKernel
from shogun.Features import StringCharFeatures, DNA
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
degree=3
inhomogene=False
kernel=PolyMatchStringKernel(feats_train, feats_train, degree, inhomogene)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
poly_match_string()
# This is an example for the initialization of the PolyMatchWordString kernel
# on string data. The PolyMatchWordString kernel sums over the matches of two
# strings of the same length and takes the sum to the power of 'degree'. The
# strings consist of the characters 'ACGT' corresponding to the DNA-alphabet.
# Each column of the matrices of type char corresponds to one training/test
# example.
def poly_match_word_string ():
print 'PolyMatchWordString'
from shogun.Kernel import PolyMatchWordStringKernel
from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
degree=2
inhomogene=True
order=3
gap=0
reverse=False
charfeat=StringCharFeatures(fm_train_dna, DNA)
feats_train=StringWordFeatures(DNA)
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
charfeat=StringCharFeatures(fm_test_dna, DNA)
feats_test=StringWordFeatures(DNA)
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
kernel=PolyMatchWordStringKernel(feats_train, feats_train, degree, inhomogene)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
poly_match_word_string()
# This example initializes the polynomial kernel with real data.
# If variable 'inhomogene' is 'True', +1 is added to the scalar product
# before taking it to the power of 'degree'. If 'use_normalization' is
# set to 'True', the kernel matrix will be normalized by the square roots
# of the diagonal entries.
def poly ():
print 'Poly'
from shogun.Features import RealFeatures
from shogun.Kernel import PolyKernel
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
degree=4
inhomogene=False
use_normalization=True
kernel=PolyKernel(
feats_train, feats_train, degree, inhomogene, use_normalization)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
poly()
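# A short sketch of the square-root normalization enabled by
# 'use_normalization' (assumed here to follow the standard convention
# k'(x,y)=k(x,y)/sqrt(k(x,x)*k(y,y)), which makes every diagonal entry of the
# training kernel matrix equal to 1):
import numpy
K=numpy.array([[4.0, 2.0], [2.0, 9.0]])
d=numpy.sqrt(numpy.diag(K))
print K/numpy.outer(d, d)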
# The SalzbergWordString kernel implements the Salzberg kernel.
#
# It is described in
#
# Engineering Support Vector Machine Kernels That Recognize Translation
# Initiation Sites,
# A. Zien, G. Raetsch, S. Mika, B. Schoelkopf, T. Lengauer, K.-R. Mueller.
#
def plugin_estimate_salzberg ():
print 'PluginEstimate w/ SalzbergWord'
from shogun.Features import StringCharFeatures, StringWordFeatures, DNA, Labels
from shogun.Kernel import SalzbergWordStringKernel
from shogun.Classifier import PluginEstimate
order=3
gap=0
reverse=False
charfeat=StringCharFeatures(fm_train_dna, DNA)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
charfeat=StringCharFeatures(fm_test_dna, DNA)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
pie=PluginEstimate()
labels=Labels(label_train_dna)
pie.set_labels(labels)
pie.set_features(feats_train)
pie.train()
kernel=SalzbergWordStringKernel(feats_train, feats_train, pie, labels)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
pie.set_features(feats_test)
pie.classify().get_labels()
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
label_train_dna=lm.load_labels('../data/label_train_dna.dat')
plugin_estimate_salzberg()
# The standard Sigmoid kernel computed on dense real valued features.
def sigmoid ():
print 'Sigmoid'
from shogun.Features import RealFeatures
from shogun.Kernel import SigmoidKernel
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
size_cache=10
gamma=1.2
coef0=1.3
kernel=SigmoidKernel(feats_train, feats_train, size_cache, gamma, coef0)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
sigmoid()
# The SimpleLocalityImprovedString kernel is a simplified and better
# performing version of the LocalityImprovedString kernel.
def simple_locality_improved_string ():
print 'SimpleLocalityImprovedString'
from shogun.Features import StringCharFeatures, DNA
from shogun.Kernel import SimpleLocalityImprovedStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
length=5
inner_degree=5
outer_degree=7
kernel=SimpleLocalityImprovedStringKernel(
feats_train, feats_train, length, inner_degree, outer_degree)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
simple_locality_improved_string()
# This example demonstrates how to use the Gaussian Kernel with sparse features.
def sparse_gaussian ():
print 'SparseGaussian'
from shogun.Features import SparseRealFeatures
from shogun.Kernel import SparseGaussianKernel
feats_train=SparseRealFeatures(fm_train_real)
feats_test=SparseRealFeatures(fm_test_real)
width=1.1
kernel=SparseGaussianKernel(feats_train, feats_train, width)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
sparse_gaussian()
# This example demonstrates how to use the Linear Kernel with sparse features.
def sparse_linear ():
print 'SparseLinear'
from shogun.Features import SparseRealFeatures
from shogun.Kernel import SparseLinearKernel, AvgDiagKernelNormalizer
feats_train=SparseRealFeatures(fm_train_real)
feats_test=SparseRealFeatures(fm_test_real)
scale=1.1
kernel=SparseLinearKernel()
kernel.set_normalizer(AvgDiagKernelNormalizer(scale))
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
sparse_linear()
# This example shows how to use the polynomial kernel with sparse features.
def sparse_poly ():
print 'SparsePoly'
from shogun.Features import SparseRealFeatures
from shogun.Kernel import SparsePolyKernel
feats_train=SparseRealFeatures(fm_train_real)
feats_test=SparseRealFeatures(fm_test_real)
size_cache=10
degree=3
inhomogene=True
kernel=SparsePolyKernel(feats_train, feats_train, size_cache, degree,
inhomogene)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
sparse_poly()
# The class TOPFeatures implements TOP kernel features obtained from
# two Hidden Markov models.
#
# It was used in
#
# K. Tsuda, M. Kawanabe, G. Raetsch, S. Sonnenburg, and K.R. Mueller. A new
# discriminative kernel from probabilistic models. Neural Computation,
# 14:2397-2414, 2002.
#
# which also has the details.
#
# Note that TOP-features are computed on the fly, so feature caching should
# be enabled for them to be efficient.
#
# It inherits its functionality from CSimpleFeatures, which should be
# consulted for further reference.
#
def top():
print "TOP Kernel"
from shogun.Features import StringCharFeatures, StringWordFeatures, TOPFeatures, DNA
from shogun.Kernel import PolyKernel
from shogun.Distribution import HMM, BW_NORMAL
N=1 # toy HMM with 1 state
M=4 # 4 observations -> DNA
pseudo=1e-1
order=1
gap=0
reverse=False
kargs=[1, False, True]
# train HMM for positive class
charfeat=StringCharFeatures(fm_hmm_pos, DNA)
hmm_pos_train=StringWordFeatures(charfeat.get_alphabet())
hmm_pos_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
pos=HMM(hmm_pos_train, N, M, pseudo)
pos.baum_welch_viterbi_train(BW_NORMAL)
# train HMM for negative class
charfeat=StringCharFeatures(fm_hmm_neg, DNA)
hmm_neg_train=StringWordFeatures(charfeat.get_alphabet())
hmm_neg_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
neg=HMM(hmm_neg_train, N, M, pseudo)
neg.baum_welch_viterbi_train(BW_NORMAL)
# Kernel training data
charfeat=StringCharFeatures(fm_train_dna, DNA)
wordfeats_train=StringWordFeatures(charfeat.get_alphabet())
wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
# Kernel testing data
charfeat=StringCharFeatures(fm_test_dna, DNA)
wordfeats_test=StringWordFeatures(charfeat.get_alphabet())
wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
# get kernel on training data
pos.set_observations(wordfeats_train)
neg.set_observations(wordfeats_train)
feats_train=TOPFeatures(10, pos, neg, False, False)
kernel=PolyKernel(feats_train, feats_train, *kargs)
km_train=kernel.get_kernel_matrix()
# get kernel on testing data
pos_clone=HMM(pos)
neg_clone=HMM(neg)
pos_clone.set_observations(wordfeats_test)
neg_clone.set_observations(wordfeats_test)
feats_test=TOPFeatures(10, pos_clone, neg_clone, False, False)
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
from numpy import where
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
label_train_dna=lm.load_labels('../data/label_train_dna.dat')
fm_hmm_pos=[ fm_train_dna[i] for i in where([label_train_dna==1])[1] ]
fm_hmm_neg=[ fm_train_dna[i] for i in where([label_train_dna==-1])[1] ]
top()
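# For reference, the TOP feature map of Tsuda et al. combines the log-odds
# of the two HMMs with the derivatives of their log-likelihoods with
# respect to all model parameters theta:
#
#   f_TOP(x) = ( log P(x|pos) - log P(x|neg),
#                d/dtheta log P(x|pos), d/dtheta log P(x|neg) )
#
# so the PolyKernel above effectively operates on these Fisher-score-like
# vectors rather than on the raw sequences.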
# The WeightedCommWordString kernel may be used to compute the weighted
# spectrum kernel (i.e. a spectrum kernel for 1 to K-mers, where each k-mer
# length is weighted by some coefficient \f$\beta_k\f$) from strings that have
# been mapped into unsigned 16bit integers.
#
# These 16bit integers correspond to k-mers. To be applicable in this kernel
# they need to be sorted (e.g. via the SortWordString pre-processor).
#
# It basically uses the algorithm in the unix "comm" command (hence the name)
# to compute:
#
# \f$ k({\bf x},{\bf x'}) = \sum_{k=1}^K \beta_k \Phi_k({\bf x}) \cdot \Phi_k({\bf x'}) \f$
#
# where \f$\Phi_k\f$ maps a sequence \f${\bf x}\f$ that consists of letters in
# \f$\Sigma\f$ to a feature vector of size \f$|\Sigma|^k\f$. In this feature
# vector each entry denotes how often the k-mer appears in that \f${\bf x}\f$.
#
# Note that this representation is especially tuned to small alphabets
# (like the 2-bit alphabet DNA), for which it enables spectrum kernels
# of order 8.
#
# For this kernel the linadd speedups are quite efficiently implemented using
# direct maps.
#
def weighted_comm_word_string ():
print 'WeightedCommWordString'
from shogun.Kernel import WeightedCommWordStringKernel
from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
from shogun.PreProc import SortWordString
order=3
gap=0
reverse=True
charfeat=StringCharFeatures(fm_train_dna, DNA)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
charfeat=StringCharFeatures(fm_test_dna, DNA)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
use_sign=False
kernel=WeightedCommWordStringKernel(feats_train, feats_train, use_sign)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
weighted_comm_word_string()
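# The (unweighted) spectrum kernel underlying this example can be spelled
# out with explicit k-mer counting. A minimal sketch for a single fixed k;
# the WeightedCommWordString kernel additionally sums such terms over
# k=1..K with weights beta_k:
from collections import defaultdict
def kmer_counts(seq, k):
    counts=defaultdict(int)
    for i in range(len(seq) - k + 1):
        counts[seq[i:i+k]]+=1
    return counts
def spectrum_k(x, y, k):
    cx, cy=kmer_counts(x, k), kmer_counts(y, k)
    return sum(cx[kmer]*cy[kmer] for kmer in cx if kmer in cy)
print spectrum_k('ACGTACGT', 'ACGTTTTT', 3)  # ACG and CGT match: 2*1 + 2*1 = 4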
# The Weighted Degree Position String kernel (Weighted Degree kernel with shifts).
#
# The WD-shift kernel of order d compares two sequences X and
# Y of length L by summing all contributions of k-mer matches of
# lengths k in 1...d, weighted by coefficients beta_k
# allowing for a positional tolerance of up to shift s.
#
def weighted_degree_position_string ():
print 'WeightedDegreePositionString'
from shogun.Features import StringCharFeatures, DNA
from shogun.Kernel import WeightedDegreePositionStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
degree=20
kernel=WeightedDegreePositionStringKernel(feats_train, feats_train, degree)
#kernel.set_shifts(zeros(len(data['train'][0]), dtype=int))
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
weighted_degree_position_string()
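# The commented-out set_shifts call above is how the positional tolerance s
# is configured: one integer per sequence position. A sketch allowing a
# uniform shift of 3 everywhere, assuming (as the hint above suggests) that
# set_shifts expects an integer array of the sequence length; the call
# itself would have to run inside the example where the kernel is defined.
from numpy import ones
seq_len=19  # hypothetical sequence length
shifts=3*ones(seq_len, dtype=int)
# kernel.set_shifts(shifts)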
# This example shows how to create a Weighted Degree String Kernel from data
# and how to compute the kernel matrix from the resulting object.
def weighted_degree_string ():
print 'WeightedDegreeString'
from shogun.Features import StringCharFeatures, DNA
from shogun.Kernel import WeightedDegreeStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
degree=20
kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)
#weights=arange(1,degree+1,dtype=double)[::-1]/ \
# sum(arange(1,degree+1,dtype=double))
#kernel.set_wd_weights(weights)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
weighted_degree_string()
# In this example we show how to perform Multiple Kernel Learning (MKL)
# with the modular interface. First, we create a number of base kernels.
# These kernels can capture different views of the same features, or actually
# consider entirely different features associated with the same example
# (e.g. DNA sequences = strings AND gene expression data = real values of the same tissue sample).
# The base kernels are then added to a CombinedKernel, which holds one
# weight per kernel and encapsulates the base kernels, so that the training
# procedure sees only a single kernel. When the CombinedKernel is evaluated
# between two examples, it computes the corresponding linear combination of base kernels according to their weights.
# We then show how to create an MKLClassifier that trains an SVM and learns the optimal
# weighting of kernels (w.r.t. a given norm q) at the same time.
# Finally, the example shows how to classify with a trained MKLClassifier.
#
from shogun.Features import CombinedFeatures, RealFeatures, Labels
from shogun.Kernel import CombinedKernel, PolyKernel, CustomKernel
from shogun.Classifier import MKLClassification
def combined_custom():
##################################
# set up and train
# create some poly train/test matrix
tfeats = RealFeatures(fm_train_real)
tkernel = PolyKernel(10,3)
tkernel.init(tfeats, tfeats)
K_train = tkernel.get_kernel_matrix()
pfeats = RealFeatures(fm_test_real)
tkernel.init(tfeats, pfeats)
K_test = tkernel.get_kernel_matrix()
# create combined train features
feats_train = CombinedFeatures()
feats_train.append_feature_obj(RealFeatures(fm_train_real))
# and corresponding combined kernel
kernel = CombinedKernel()
kernel.append_kernel(CustomKernel(K_train))
kernel.append_kernel(PolyKernel(10,2))
kernel.init(feats_train, feats_train)
# train mkl
labels = Labels(fm_label_twoclass)
mkl = MKLClassification()
# which norm to use for MKL
mkl.set_mkl_norm(1) # also try 2 or 3
# set cost (neg, pos)
mkl.set_C(1, 1)
# set kernel and labels
mkl.set_kernel(kernel)
mkl.set_labels(labels)
# train
mkl.train()
#w=kernel.get_subkernel_weights()
#kernel.set_subkernel_weights(w)
##################################
# test
# create combined test features
feats_pred = CombinedFeatures()
feats_pred.append_feature_obj(RealFeatures(fm_test_real))
# and corresponding combined kernel
kernel = CombinedKernel()
kernel.append_kernel(CustomKernel(K_test))
kernel.append_kernel(PolyKernel(10, 2))
kernel.init(feats_train, feats_pred)
# and classify
mkl.set_kernel(kernel)
out = mkl.classify().get_labels()
if __name__=='__main__':
from tools.load import LoadMatrix
lm = LoadMatrix()
fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
fm_label_twoclass = lm.load_labels('../data/label_train_twoclass.dat')
print fm_train_real.shape
print fm_test_real.shape
combined_custom()
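# After mkl.train() the learned kernel weighting can be inspected through
# the CombinedKernel, as the commented-out lines in the example hint at.
# A short sketch (to be run right after training, where kernel and mkl
# are in scope):
#
# w = kernel.get_subkernel_weights()
# print w  # one weight per appended base kernel, reflecting the chosen MKL norm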
# In this example we show how to perform Multiple Kernel Learning (MKL)
# with the modular interface for multi-class classification.
# First, we create a number of base kernels and features.
# These kernels can capture different views of the same features, or actually
# consider entirely different features associated with the same example
# (e.g. DNA sequences = strings AND gene expression data = real values of the same tissue sample).
# The base kernels are then added to a CombinedKernel, which holds one
# weight per kernel and encapsulates the base kernels, so that the training
# procedure sees only a single kernel. When the CombinedKernel is evaluated
# between two examples, it computes the corresponding linear combination of base kernels according to their weights.
# We then show how to create an MKLMultiClass classifier that trains an SVM and learns the optimal
# weighting of kernels (w.r.t. a given norm q) at the same time. The main difference from the binary
# classification version of MKL is that more than two label values can be used when training
# the classifier.
# Finally, the example shows how to classify with a trained MKLMultiClass classifier.
#
from shogun.Features import CombinedFeatures, RealFeatures, Labels
from shogun.Kernel import CombinedKernel, GaussianKernel, LinearKernel, PolyKernel
from shogun.Classifier import MKLMultiClass
def mkl_multiclass ():
print 'mkl_multiclass'
width = 1.2
C = 1.2
epsilon = 1e-5
num_threads = 1
kernel = CombinedKernel()
feats_train = CombinedFeatures()
feats_test = CombinedFeatures()
subkfeats_train = RealFeatures(fm_train_real)
subkfeats_test = RealFeatures(fm_test_real)
subkernel = GaussianKernel(10, width)
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
subkfeats_train = RealFeatures(fm_train_real)
subkfeats_test = RealFeatures(fm_test_real)
subkernel = LinearKernel()
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
subkfeats_train = RealFeatures(fm_train_real)
subkfeats_test = RealFeatures(fm_test_real)
subkernel = PolyKernel(10,2)
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
kernel.init(feats_train, feats_train)
labels = Labels(label_train_multiclass)
mkl = MKLMultiClass(C, kernel, labels)
mkl.set_epsilon(epsilon)
mkl.parallel.set_num_threads(num_threads)
mkl.set_mkl_epsilon(0.001)
mkl.set_mkl_norm(1.5)
mkl.train()
kernel.init(feats_train, feats_test)
out = mkl.classify().get_labels()
print out
if __name__ == '__main__':
from tools.load import LoadMatrix
lm = LoadMatrix()
fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
label_train_multiclass = lm.load_labels('../data/label_train_multiclass.dat')
mkl_multiclass()
# In this example a kernel matrix is computed for a given real-valued data set.
# The kernel used is the Chi2 kernel which operates on real-valued vectors. It
# computes the chi-squared distance between sets of histograms. It is a very
# useful distance in image recognition (used to detect objects). The preprocessor
# LogPlusOne adds one to a dense real-valued vector and takes the logarithm of
# each component of it. It is most useful in situations where the inputs are
# counts: when comparing small counts, any difference may matter
# a lot, while the same difference between large counts hardly matters. This is
# what the log transformation accounts for.
def log_plus_one ():
print 'LogPlusOne'
from shogun.Kernel import Chi2Kernel
from shogun.Features import RealFeatures
from shogun.PreProc import LogPlusOne
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
preproc=LogPlusOne()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
width=1.4
size_cache=10
kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
log_plus_one()
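# The LogPlusOne transform itself is just log(1+x) applied elementwise,
# which is easy to verify in plain NumPy (an illustration, independent of
# the example above):
from numpy import array, log
counts=array([0.0, 1.0, 9.0, 99.0])
print log(counts + 1)  # [ 0.  0.693...  2.302...  4.605...]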
# In this example a kernel matrix is computed for a given real-valued data set.
# The kernel used is the Chi2 kernel which operates on real-valued vectors. It
# computes the chi-squared distance between sets of histograms. It is a very
# useful distance in image recognition (used to detect objects). The preprocessor
# NormOne, normalizes vectors to have norm 1.
def norm_one ():
print 'NormOne'
from shogun.Kernel import Chi2Kernel
from shogun.Features import RealFeatures
from shogun.PreProc import NormOne
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
preproc=NormOne()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
width=1.4
size_cache=10
kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
norm_one()
# In this example a kernel matrix is computed for a given real-valued data set.
# The kernel used is the Chi2 kernel which operates on real-valued vectors. It
# computes the chi-squared distance between sets of histograms. It is a very
# useful distance in image recognition (used to detect objects). The preprocessor
# PruneVarSubMean subtracts the mean from each feature and removes features that
# have zero variance.
def prune_var_sub_mean ():
print 'PruneVarSubMean'
from shogun.Kernel import Chi2Kernel
from shogun.Features import RealFeatures
from shogun.PreProc import PruneVarSubMean
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
preproc=PruneVarSubMean()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
width=1.4
size_cache=10
kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
prune_var_sub_mean()
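# The core of PruneVarSubMean can be mimicked in NumPy: center every
# feature and drop features with zero variance. A minimal sketch, assuming
# examples are stored column-wise as elsewhere on this page (the actual
# preprocessor may additionally normalize by the standard deviation):
from numpy import array
X=array([[1.0, 2.0, 3.0],
[5.0, 5.0, 5.0]])  # the second feature is constant
X=X - X.mean(axis=1)[:, None]  # subtract the per-feature mean
X=X[X.var(axis=1) > 0]  # remove zero-variance features
print X  # [[-1.  0.  1.]]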
# In this example a kernel matrix is computed for a given string data set. The
# CommUlongString kernel is used to compute the spectrum kernel from strings that
# have been mapped into unsigned 64bit integers. These 64bit integers correspond
# to k-mers. To be applicable in this kernel the mapped k-mers have to be sorted.
# This is done using the SortUlongString preprocessor, which sorts the individual
# strings in ascending order. The kernel function basically uses the algorithm in
# the unix "comm" command (hence the name). Note that this representation enables
# spectrum kernels of order 8 for 8bit alphabets (like binaries) and order 32 for
# 2-bit alphabets like DNA. For this kernel the linadd speedups are implemented
# (though there is room for improvement here when a whole set of sequences is
# ADDed) using sorted lists.
def sort_ulong_string ():
print 'CommUlongString'
from shogun.Kernel import CommUlongStringKernel
from shogun.Features import StringCharFeatures, StringUlongFeatures, DNA
from shogun.PreProc import SortUlongString
order=3
gap=0
reverse=False
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringUlongFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringUlongFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortUlongString()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
use_sign=False
kernel=CommUlongStringKernel(feats_train, feats_train, use_sign)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
sort_ulong_string()
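# The unix-comm-style merge mentioned above computes the dot product of two
# sorted k-mer lists in a single linear pass. A sketch on plain Python
# lists of (already sorted) k-mer codes, illustrating the idea only:
def comm_dot(a, b):
    i=j=dot=0
    while i < len(a) and j < len(b):
        if a[i] < b[j]:
            i+=1
        elif a[i] > b[j]:
            j+=1
        else:
            va=a[i]
            na=nb=0
            while i < len(a) and a[i]==va:
                i+=1; na+=1
            while j < len(b) and b[j]==va:
                j+=1; nb+=1
            dot+=na*nb  # runs of equal k-mers multiply out
    return dot
print comm_dot([1, 1, 2, 5], [1, 2, 2, 7])  # 2*1 + 1*2 = 4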
# In this example a kernel matrix is computed for a given string data set. The
# CommWordString kernel is used to compute the spectrum kernel from strings that
# have been mapped into unsigned 16bit integers. These 16bit integers correspond
# to k-mers. To be applicable in this kernel the mapped k-mers have to be sorted.
# This is done using the SortWordString preprocessor, which sorts the individual
# strings in ascending order. The kernel function basically uses the algorithm in
# the unix "comm" command (hence the name). Note that this representation is
# especially tuned to small alphabets (like the 2-bit alphabet DNA), for which it
# enables spectrum kernels of order up to 8. For this kernel the linadd speedups
# are quite efficiently implemented using direct maps.
def sort_word_string ():
print 'CommWordString'
from shogun.Kernel import CommWordStringKernel
from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
from shogun.PreProc import SortWordString
order=3
gap=0
reverse=False
charfeat=StringCharFeatures(fm_train_dna, DNA)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
charfeat=StringCharFeatures(fm_test_dna, DNA)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
use_sign=False
kernel=CommWordStringKernel(feats_train, feats_train, use_sign)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
sort_word_string()
# In this example a kernelized version of ridge regression (KRR) is trained on a
# real-valued data set. The KRR is trained with regularization parameter tau=1e-6
# and a Gaussian kernel with width=0.8. The labels of both the train and the test
# data can be fetched via krr.classify().get_labels().
###########################################################################
# kernel ridge regression
###########################################################################
def krr ():
print 'KRR'
from shogun.Features import Labels, RealFeatures
from shogun.Kernel import GaussianKernel
from shogun.Regression import KRR
feats_train=RealFeatures(fm_train)
feats_test=RealFeatures(fm_test)
width=0.8
kernel=GaussianKernel(feats_train, feats_train, width)
tau=1e-6
labels=Labels(label_train)
krr=KRR(tau, kernel, labels)
krr.train(feats_train)
kernel.init(feats_train, feats_test)
out = krr.classify().get_labels()
return out
# equivalent shorter version
def krr_short ():
print 'KRR_short'
from shogun.Features import Labels, RealFeatures
from shogun.Kernel import GaussianKernel
from shogun.Regression import KRR
width=0.8; tau=1e-6
krr=KRR(tau, GaussianKernel(0, width), Labels(label_train))
krr.train(RealFeatures(fm_train))
out = krr.classify(RealFeatures(fm_test)).get_labels()
return out
if __name__=='__main__':
from numpy import array
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train=lm.load_numbers('../data/fm_train_real.dat')
fm_test=lm.load_numbers('../data/fm_test_real.dat')
label_train=lm.load_labels('../data/label_train_twoclass.dat')
out1=krr()
out2=krr_short()
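# KRR has a closed-form solution, alpha = (K + tau*I)^-1 y, and predicts
# with k(x_test, .)'alpha. The NumPy sketch below mirrors this on random
# data, assuming a plain Gaussian kernel exp(-||x-y||^2/width) without
# normalization; it illustrates what the KRR object computes internally.
from numpy import exp, eye, dot
from numpy.linalg import solve
from numpy.random import rand
X=rand(2, 20); y=rand(20); width=0.8; tau=1e-6
D=((X[:, :, None] - X[:, None, :])**2).sum(0)  # pairwise squared distances
K=exp(-D/width)
alpha=solve(K + tau*eye(20), y)  # fit
print dot(K, alpha)[:3]  # predictions on the training data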
# In this example a support vector regression algorithm is trained on a
# real-valued toy data set. The underlying library used for the SVR training is
# LIBSVM. The SVR is trained with regularization parameter C=1 and a Gaussian
# kernel with width=2.1. The labels of both the train and the test data are
# fetched via svr.classify().get_labels().
#
# For more details on LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/ .
def libsvr ():
print 'LibSVR'
from shogun.Features import Labels, RealFeatures
from shogun.Kernel import GaussianKernel
from shogun.Regression import LibSVR
feats_train=RealFeatures(fm_train)
feats_test=RealFeatures(fm_test)
width=2.1
kernel=GaussianKernel(feats_train, feats_train, width)
C=1
epsilon=1e-5
tube_epsilon=1e-2
labels=Labels(label_train)
svr=LibSVR(C, epsilon, kernel, labels)
svr.set_tube_epsilon(tube_epsilon)
svr.train()
kernel.init(feats_train, feats_test)
out1=svr.classify().get_labels()
out2=svr.classify(feats_test).get_labels()
if __name__=='__main__':
from numpy import array
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train=lm.load_numbers('../data/fm_train_real.dat')
fm_test=lm.load_numbers('../data/fm_test_real.dat')
label_train=lm.load_labels('../data/label_train_twoclass.dat')
libsvr()
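# For reference, tube_epsilon above is the epsilon of the
# epsilon-insensitive loss used by support vector regression,
#
#   L(y, f(x)) = max(0, |y - f(x)| - tube_epsilon),
#
# i.e. deviations smaller than the tube radius are not penalized, while
# the separate epsilon parameter is only the termination accuracy of the
# optimizer.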
# In this example a support vector regression algorithm is trained on a
# real-valued toy data set. The underlying library used for the SVR training is
# SVM^light. The SVR is trained with regularization parameter C=1 and a Gaussian
# kernel with width=2.1. The labels of both the train and the test data are
# fetched via svr.classify().get_labels().
#
# For more details on the SVM^light see
# T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel
# Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA USA, 1999.
###########################################################################
# svm light based support vector regression
###########################################################################
def svr_light ():
print 'SVRLight'
from shogun.Features import Labels, RealFeatures
from shogun.Kernel import GaussianKernel
try:
from shogun.Regression import SVRLight
except ImportError:
print 'No support for SVRLight available.'
return
feats_train=RealFeatures(fm_train)
feats_test=RealFeatures(fm_test)
width=2.1
kernel=GaussianKernel(feats_train, feats_train, width)
C=1
epsilon=1e-5
tube_epsilon=1e-2
num_threads=3
labels=Labels(label_train)
svr=SVRLight(C, epsilon, kernel, labels)
svr.set_tube_epsilon(tube_epsilon)
svr.parallel.set_num_threads(num_threads)
svr.train()
kernel.init(feats_train, feats_test)
svr.classify().get_labels()
if __name__=='__main__':
from numpy import array
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train=lm.load_numbers('../data/fm_train_real.dat')
fm_test=lm.load_numbers('../data/fm_test_real.dat')
label_train=lm.load_labels('../data/label_train_twoclass.dat')
svr_light()
# This example shows how to pickle an SVMLight object, which relies on boost
# serialization support (only available if the corresponding compile flag was
# enabled). Note that this code is in alpha state.
from shogun.Features import *
from shogun.Library import MSG_DEBUG
from shogun.Features import StringCharFeatures, Labels, DNA, Alphabet
from shogun.Kernel import WeightedDegreeStringKernel, GaussianKernel
from shogun.Classifier import SVMLight
from numpy import *
from numpy.random import randn
import sys
import types
import random
import bz2
import cPickle
import inspect
def save(filename, myobj):
"""
save object to file using pickle
@param filename: name of destination file
@type filename: str
@param myobj: object to save (has to be pickleable)
@type myobj: obj
"""
try:
f = bz2.BZ2File(filename, 'wb')
except IOError, details:
sys.stderr.write('File ' + filename + ' cannot be written\n')
sys.stderr.write(str(details) + '\n')
return
cPickle.dump(myobj, f, protocol=2)
f.close()
def load(filename):
"""
Load from filename using pickle
@param filename: name of file to load from
@type filename: str
"""
try:
f = bz2.BZ2File(filename, 'rb')
except IOError, details:
sys.stderr.write('File ' + filename + ' cannot be read\n')
sys.stderr.write(str(details) + '\n')
return
myobj = cPickle.load(f)
f.close()
return myobj
##################################################
num=10
dist=1
width=2.1
traindata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1)
testdata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1)
trainlab=concatenate((-ones(num), ones(num)))
testlab=concatenate((-ones(num), ones(num)))
feats_train=RealFeatures(traindata_real)
feats_test=RealFeatures(testdata_real)
kernel=GaussianKernel(feats_train, feats_train, width)
kernel.io.set_loglevel(MSG_DEBUG)
labels=Labels(trainlab);
svm=SVMLight(2, kernel, labels)
svm.train()
svm.io.set_loglevel(MSG_DEBUG)
##################################################
print "labels:"
print labels.to_string()
print "features"
print feats_train.to_string()
print "kernel"
print kernel.to_string()
print "svm"
print svm.to_string()
print "#################################"
fn = "serialized_svm.bz2"
print "serializing SVM to file", fn
save(fn, svm)
print "#################################"
print "unserializing SVM"
svm2 = load(fn)
print "#################################"
print "comparing training"
svm2.train()
print "objective before serialization:", svm.get_objective()
print "objective after serialization:", svm2.get_objective()
# In this example we use the dynamic programming implementation with a
# gene-finding-specific model. The model and the training parameters
# are stored in a file and are used to create a gene prediction on
# an example sequence.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from shogun.Structure import *
import numpy
from numpy import array,Inf,float64,matrix,frompyfunc,zeros
#from IPython.Shell import IPShellEmbed
#ipshell = IPShellEmbed()
import gzip
import scipy
from scipy.io import loadmat
import pickle
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
if scipy.__version__ >= '0.7.0':
renametable = {
'scipy.io.mio5': 'scipy.io.matlab.mio5',
'scipy.sparse.sparse' : 'scipy.sparse',
}
else:
renametable = {}
def mapname(name):
if name in renametable:
return renametable[name]
return name
def mapped_load_global(self):
module = mapname(self.readline()[:-1])
name = mapname(self.readline()[:-1])
klass = self.find_class(module, name)
self.append(klass)
def loads(str):
file = StringIO(str)
unpickler = pickle.Unpickler(file)
unpickler.dispatch[pickle.GLOBAL] = mapped_load_global
return unpickler.load()
def run_test():
data_dict = loads(gzip.GzipFile('../data/DynProg_example_py.pickle.gz').read())
#data_dict = loadmat('../data/DynProg_example_py.dat.mat', appendmat=False, struct_as_record=False)
#print data_dict
#print len(data_dict['penalty_array'][0][0][0][0].limits[0])
num_plifs,num_limits = len(data_dict['penalty_array']),len(data_dict['penalty_array'][0].limits)
pm = PlifMatrix()
pm.create_plifs(num_plifs,num_limits)
ids = numpy.array(range(num_plifs),dtype=numpy.int32)
min_values = numpy.array(range(num_plifs),dtype=numpy.float64)
max_values = numpy.array(range(num_plifs),dtype=numpy.float64)
all_use_cache = numpy.array(range(num_plifs),dtype=numpy.bool)
all_use_svm = numpy.array(range(num_plifs),dtype=numpy.int32)
all_limits = zeros((num_plifs,num_limits))
all_penalties = zeros((num_plifs,num_limits))
all_names = ['']*num_plifs
all_transforms = ['']*num_plifs
for plif_idx in range(num_plifs):
ids[plif_idx] = data_dict['penalty_array'][plif_idx].id-1
min_values[plif_idx] = data_dict['penalty_array'][plif_idx].min_value
max_values[plif_idx] = data_dict['penalty_array'][plif_idx].max_value
all_use_cache[plif_idx] = data_dict['penalty_array'][plif_idx].use_cache
all_use_svm[plif_idx] = data_dict['penalty_array'][plif_idx].use_svm
all_limits[plif_idx] = data_dict['penalty_array'][plif_idx].limits
all_penalties[plif_idx] = data_dict['penalty_array'][plif_idx].penalties
all_names[plif_idx] = str(data_dict['penalty_array'][plif_idx].name)
all_transforms[plif_idx] = str(data_dict['penalty_array'][plif_idx].transform)
if all_transforms[plif_idx] == '[]':
all_transforms[plif_idx] = 'linear'
pm.set_plif_ids(ids)
pm.set_plif_min_values(min_values)
pm.set_plif_max_values(max_values)
pm.set_plif_use_cache(all_use_cache)
pm.set_plif_use_svm(all_use_svm)
pm.set_plif_limits(all_limits)
pm.set_plif_penalties(all_penalties)
#pm.set_plif_names(all_names)
#pm.set_plif_transform_type(all_transforms)
transition_ptrs = data_dict['model'].transition_pointers
transition_ptrs = transition_ptrs[:,:,0:2]
transition_ptrs = transition_ptrs.astype(numpy.float64)
pm.compute_plif_matrix(transition_ptrs)
# init_dyn_prog
num_svms = 8
dyn = DynProg(num_svms)
orf_info = data_dict['model'].orf_info
orf_info = orf_info.astype(numpy.int32)
num_states = orf_info.shape[0]
dyn.set_num_states(num_states)
block = data_dict['block']
seq_len = len(block.seq)
seq = str(block.seq)
gene_string = array([elem for elem in seq])
# precompute_content_svms
pos = block.all_pos-1
pos = pos.astype(numpy.int32)
snd_pos = pos
dyn.set_pos(pos)
dyn.set_gene_string(gene_string)
dyn.create_word_string()
dyn.precompute_stop_codons()
dyn.init_content_svm_value_array(num_svms)
dict_weights = data_dict['content_weights']
dict_weights = dict_weights.reshape(8,1).astype(numpy.float64)
# note: the loaded weights are immediately replaced by a zero matrix of the
# shape the dynamic program expects
dict_weights = zeros((8,5440))
dyn.set_dict_weights(dict_weights.T)
dyn.precompute_content_values()
dyn.init_mod_words_array(data_dict['model'].mod_words.astype(numpy.int32))
pm.compute_signal_plifs(data_dict['state_signals'].astype(numpy.int32))
dyn.set_orf_info(orf_info)
#
p = data_dict['model'].p
q = data_dict['model'].q
dyn.set_p_vector(p)
dyn.set_q_vector(q)
a_trans = data_dict['a_trans']
a_trans = a_trans.astype(float64)
dyn.set_a_trans_matrix(a_trans)
dyn.check_svm_arrays()
features = data_dict['block'].features
dyn.set_observation_matrix(features)
dyn.set_content_type_array(data_dict['seg_path'].astype(numpy.float64))
dyn.best_path_set_segment_loss(data_dict['loss'].astype(numpy.float64))
use_orf = True
feat_dims = [25,201,2]
dyn.set_plif_matrices(pm)
dyn.compute_nbest_paths(features.shape[2], use_orf, 1, True, False)
# fetch results
states = dyn.get_states()
print states
scores = dyn.get_scores()
print scores
positions = dyn.get_positions()
print positions
if __name__ == '__main__':
run_test()
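# This example repeatedly builds word features from a fixed set of toy DNA
# sequences, sorts them with the SortWordString preprocessor and computes a
# CommWordString kernel matrix inside a loop; it appears intended as a
# stress/memory test rather than a learning example.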
import gc
from shogun.Features import Alphabet,StringCharFeatures,StringWordFeatures,DNA
from shogun.PreProc import SortWordString, MSG_DEBUG
from shogun.Kernel import CommWordStringKernel, IdentityKernelNormalizer
from numpy import mat
POS=[100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'TTGT', 100*'TTGT', 100*'TTGT',100*'TTGT', 100*'TTGT',
100*'TTGT',100*'TTGT', 100*'TTGT', 100*'TTGT',100*'TTGT', 100*'TTGT',
100*'TTGT',100*'TTGT', 100*'TTGT', 100*'TTGT',100*'TTGT', 100*'TTGT',
100*'TTGT',100*'TTGT', 100*'TTGT', 100*'TTGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT']
NEG=[100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'TTGT', 100*'TTGT', 100*'TTGT',100*'TTGT', 100*'TTGT',
100*'TTGT',100*'TTGT', 100*'TTGT', 100*'TTGT',100*'TTGT', 100*'TTGT',
100*'TTGT',100*'TTGT', 100*'TTGT', 100*'TTGT',100*'TTGT', 100*'TTGT',
100*'TTGT',100*'TTGT', 100*'TTGT', 100*'TTGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT',
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT']
order=7
gap=0
reverse=False
for i in xrange(10):
alpha=Alphabet(DNA)
traindat=StringCharFeatures(alpha)
traindat.set_features(POS+NEG)
trainudat=StringWordFeatures(traindat.get_alphabet());
trainudat.obtain_from_char(traindat, order-1, order, gap, reverse)
#trainudat.io.set_loglevel(MSG_DEBUG)
pre = SortWordString()
#pre.io.set_loglevel(MSG_DEBUG)
pre.init(trainudat)
trainudat.add_preproc(pre)
trainudat.apply_preproc()
spec = CommWordStringKernel(10, False)
spec.set_normalizer(IdentityKernelNormalizer())
spec.init(trainudat, trainudat)
K=mat(spec.get_kernel_matrix())
del POS
del NEG
del order
del gap
del reverse