本页面包含了所有Python模块化接口的例子。
要运行这些例子,只需在命令行中执行:
python name_of_example.py
# Example: train a LibSVM on a user-supplied (custom) kernel matrix.
from numpy import *
from numpy.random import rand
from shogun.Features import RealFeatures, Labels
from shogun.Kernel import CustomKernel
from shogun.Classifier import LibSVM

C = 1
dim = 7

# Random +1/-1 labels and a random square data matrix.
lab = sign(2 * rand(dim) - 1)
data = rand(dim, dim)

# A kernel matrix must be symmetric (and positive semi-definite). The Gram
# matrix data . data^T satisfies both. Previously `symdata` was an
# element-wise product that was never used, and the raw, non-symmetric
# `data` was handed to the kernel instead.
symdata = dot(data, data.T)

kernel = CustomKernel()
kernel.set_full_kernel_matrix_from_full(symdata)

labels = Labels(lab)
svm = LibSVM(C, kernel, labels)
svm.train()

# Classify with the kernel as it is currently initialised (training matrix).
out = svm.classify().get_labels()
# Example: domain adaptation SVM. An SVMLight is trained on a first data set,
# then a DomainAdaptationSVM is trained on a second data set while being
# regularized against the first solution.
import numpy
from shogun.Features import StringCharFeatures, Labels, DNA
from shogun.Kernel import WeightedDegreeStringKernel
from shogun.Classifier import SVMLight, DomainAdaptationSVM

degree = 3

# First data set.
fm_train_dna = ['CGCACGTACGTAGCTCGAT',
                'CGACGTAGTCGTAGTCGTA',
                'CGACGGGGGGGGGGTCGTA',
                'CGACCTAGTCGTAGTCGTA',
                'CGACCACAGTTATATAGTA',
                'CGACGTAGTCGTAGTCGTA',
                'CGACGTAGTTTTTTTCGTA',
                'CGACGTAGTCGTAGCCCCA',
                'CAAAAAAAAAAAAAAAATA',
                'CGACGGGGGGGGGGGCGTA']
label_train_dna = numpy.array(5*[-1.0] + 5*[1.0])
fm_test_dna = ['AGCACGTACGTAGCTCGAT',
               'AGACGTAGTCGTAGTCGTA',
               'CAACGGGGGGGGGGTCGTA',
               'CGACCTAGTCGTAGTCGTA',
               'CGAACACAGTTATATAGTA',
               'CGACCTAGTCGTAGTCGTA',
               'CGACGTGGGGTTTTTCGTA',
               'CGACGTAGTCCCAGCCCCA',
               'CAAAAAAAAAAAACCAATA',
               'CGACGGCCGGGGGGGCGTA']
label_test_dna = numpy.array(5*[-1.0] + 5*[1.0])

# Second data set.
fm_train_dna2 = ['AGACAGTCAGTCGATAGCT',
                 'AGCAGTCGTAGTCGTAGTC',
                 'AGCAGGGGGGGGGGTAGTC',
                 'AGCAATCGTAGTCGTAGTC',
                 'AGCAACACGTTCTCTCGTC',
                 'AGCAGTCGTAGTCGTAGTC',
                 'AGCAGTCGTTTTTTTAGTC',
                 'AGCAGTCGTAGTCGAAAAC',
                 'ACCCCCCCCCCCCCCCCTC',
                 'AGCAGGGGGGGGGGGAGTC']
label_train_dna2 = numpy.array(5*[-1.0] + 5*[1.0])
fm_test_dna2 = ['CGACAGTCAGTCGATAGCT',
                'CGCAGTCGTAGTCGTAGTC',
                'ACCAGGGGGGGGGGTAGTC',
                'AGCAATCGTAGTCGTAGTC',
                'AGCCACACGTTCTCTCGTC',
                'AGCAATCGTAGTCGTAGTC',
                'AGCAGTGGGGTTTTTAGTC',
                'AGCAGTCGTAAACGAAAAC',
                'ACCCCCCCCCCCCAACCTC',
                'AGCAGGAAGGGGGGGAGTC']
label_test_dna2 = numpy.array(5*[-1.0] + 5*[1.0])

C = 1.0

# Train the initial SVM on the first data set.
feats_train = StringCharFeatures(fm_train_dna, DNA)
feats_test = StringCharFeatures(fm_test_dna, DNA)
kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)
labels = Labels(label_train_dna)
svm = SVMLight(C, kernel, labels)
svm.train()

#####################################

print("obtaining DA SVM from previously trained SVM")

# Build features, kernel and labels from the SECOND data set. Previously
# these were all built from the first data set again, which left
# fm_train_dna2 / fm_test_dna2 / label_train_dna2 completely unused.
feats_train2 = StringCharFeatures(fm_train_dna2, DNA)
feats_test2 = StringCharFeatures(fm_test_dna2, DNA)
kernel2 = WeightedDegreeStringKernel(feats_train2, feats_train2, degree)
labels2 = Labels(label_train_dna2)

# we regularize against the previously obtained solution
dasvm = DomainAdaptationSVM(C, kernel2, labels2, svm, 1.0)
dasvm.train()

out = dasvm.classify(feats_test2).get_labels()
print(out)
def gmnpsvm():
    """Train a GMNPSVM with a Gaussian kernel and classify the test features.

    Reads the module-level ``fm_train_real``, ``fm_test_real`` and
    ``label_train_multiclass`` loaded by the ``__main__`` guard below.
    The predicted labels are computed but not returned.
    """
    print('GMNPSVM')
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import GMNPSVM

    # Example settings.
    kernel_width = 2.1
    svm_c = 1
    eps = 1e-5

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)
    train_labels = Labels(label_train_multiclass)
    gaussian = GaussianKernel(train_feats, train_feats, kernel_width)

    machine = GMNPSVM(svm_c, gaussian, train_labels)
    machine.set_epsilon(eps)
    machine.train(train_feats)

    # The explicit kernel.init(train, test) call is left disabled here; the
    # test features are handed to classify() directly instead.
    out = machine.classify(test_feats).get_labels()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    label_train_multiclass = lm.load_labels('../data/label_train_multiclass.dat')
    gmnpsvm()
def gpbtsvm():
    """Train a GPBTSVM with a Gaussian kernel, then classify the test set.

    Uses the module-level ``fm_train_real``, ``fm_test_real`` and
    ``label_train_twoclass`` loaded by the ``__main__`` guard below.
    The predicted labels are computed and discarded.
    """
    print('GPBTSVM')
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import GPBTSVM

    kernel_width = 2.1
    svm_c = 1
    eps = 1e-5

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)
    train_labels = Labels(label_train_twoclass)
    gaussian = GaussianKernel(train_feats, train_feats, kernel_width)

    machine = GPBTSVM(svm_c, gaussian, train_labels)
    machine.set_epsilon(eps)
    machine.train()

    # Re-initialise the kernel on (train, test) before classifying.
    gaussian.init(train_feats, test_feats)
    machine.classify().get_labels()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    label_train_twoclass = lm.load_labels('../data/label_train_twoclass.dat')
    gpbtsvm()
def knn():
    """Train a KNN classifier on a Euclidian distance and classify the test set.

    Uses the module-level ``fm_train_real``, ``fm_test_real`` and
    ``label_train_multiclass`` loaded by the ``__main__`` guard below.
    The predicted labels are computed but not returned.
    """
    print('KNN')
    from shogun.Features import RealFeatures, Labels
    from shogun.Classifier import KNN
    from shogun.Distance import EuclidianDistance

    num_neighbours = 3

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)
    train_labels = Labels(label_train_multiclass)
    metric = EuclidianDistance(train_feats, train_feats)

    classifier = KNN(num_neighbours, metric, train_labels)
    classifier.train()
    output = classifier.classify(test_feats).get_labels()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    label_train_multiclass = lm.load_labels('../data/label_train_multiclass.dat')
    knn()
def larank():
    """Train a LaRank machine with a Gaussian kernel and classify.

    Uses the module-level ``fm_train_real``, ``fm_test_real`` and
    ``label_train_multiclass`` loaded by the ``__main__`` guard below.
    The predicted labels are computed but not returned.
    """
    print('LaRank')
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import LaRank

    kernel_width = 2.1
    svm_c = 1
    eps = 1e-5

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)
    train_labels = Labels(label_train_multiclass)
    gaussian = GaussianKernel(train_feats, train_feats, kernel_width)

    machine = LaRank(svm_c, gaussian, train_labels)
    # Optional knobs that the example leaves disabled:
    #machine.set_tau(1e-3)
    #machine.set_batch_mode(False)
    #machine.io.enable_progress()
    machine.set_epsilon(eps)
    machine.train()

    # NOTE(review): this classifies the TRAINING features; test_feats is
    # built above but never used. Confirm whether that is intended.
    out = machine.classify(train_feats).get_labels()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    label_train_multiclass = lm.load_labels('../data/label_train_multiclass.dat')
    larank()
def lda():
    """Train an LDA classifier on the training data and classify the test set.

    Uses the module-level ``fm_train_real``, ``fm_test_real`` and
    ``label_train_twoclass`` loaded by the ``__main__`` guard below.
    Bias, weights and predictions are queried but not returned.
    """
    print('LDA')
    from shogun.Features import RealFeatures, Labels
    from shogun.Classifier import LDA

    lda_gamma = 3
    num_threads = 1  # set by the example but not used below

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)
    train_labels = Labels(label_train_twoclass)

    classifier = LDA(lda_gamma, train_feats, train_labels)
    classifier.train()

    # Query the trained model.
    classifier.get_bias()
    classifier.get_w()

    # Swap in the test features before classifying.
    classifier.set_features(test_feats)
    classifier.classify().get_labels()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    label_train_twoclass = lm.load_labels('../data/label_train_twoclass.dat')
    lda()
def liblinear():
    """Train a LibLinear machine on sparse features and classify the test set.

    Uses the module-level ``fm_train_real``, ``fm_test_real`` and
    ``label_train_twoclass`` loaded by the ``__main__`` guard below.
    The predicted labels are computed and discarded.
    """
    print('LibLinear')
    from shogun.Features import RealFeatures, SparseRealFeatures, Labels
    from shogun.Classifier import LibLinear

    svm_c = 0.9
    eps = 1e-5
    thread_count = 1

    # Dense -> sparse conversion for the training data ...
    dense_train = RealFeatures(fm_train_real)
    sparse_train = SparseRealFeatures()
    sparse_train.obtain_from_simple(dense_train)

    # ... and for the test data.
    dense_test = RealFeatures(fm_test_real)
    sparse_test = SparseRealFeatures()
    sparse_test.obtain_from_simple(dense_test)

    train_labels = Labels(label_train_twoclass)

    machine = LibLinear(svm_c, sparse_train, train_labels)
    machine.set_epsilon(eps)
    machine.parallel.set_num_threads(thread_count)
    machine.set_bias_enabled(True)
    machine.train()

    # Swap in the test features before classifying.
    machine.set_features(sparse_test)
    machine.classify().get_labels()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    label_train_twoclass = lm.load_labels('../data/label_train_twoclass.dat')
    liblinear()
# Example: minimal LibSVM run on two shifted random point clouds, printing
# the fraction of misclassified test points.
from numpy import *
from numpy.random import randn
from shogun.Features import *
from shogun.Classifier import *
from shogun.Kernel import *

num = 1000
dist = 1
width = 2.1
C = 1

# Two clouds of `num` 2-d points each, shifted by -dist and +dist.
traindata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist),
                             axis=1)
testdata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist),
                            axis=1)

# First `num` points are labelled -1, the remaining `num` points +1.
trainlab = concatenate((-ones(num), ones(num)))
testlab = concatenate((-ones(num), ones(num)))

feats_train = RealFeatures(traindata_real)
feats_test = RealFeatures(testdata_real)
kernel = GaussianKernel(feats_train, feats_train, width)

labels = Labels(trainlab)
svm = LibSVM(C, kernel, labels)
svm.train()

# Re-initialise the kernel on (train, test) before classifying.
kernel.init(feats_train, feats_test)
out = svm.classify().get_labels()

# Fraction of test points whose predicted sign differs from the label.
testerr = mean(sign(out) != testlab)
print(testerr)
def libsvm():
    """Train a LibSVM with a Gaussian kernel, classify, and query the model.

    Uses the module-level ``fm_train_real``, ``fm_test_real`` and
    ``label_train_twoclass`` loaded by the ``__main__`` guard below.
    Predictions, support vector indices and alphas are not returned.
    """
    print('LibSVM')
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import LibSVM

    kernel_width = 2.1
    svm_c = 1
    eps = 1e-5

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)
    train_labels = Labels(label_train_twoclass)
    gaussian = GaussianKernel(train_feats, train_feats, kernel_width)

    machine = LibSVM(svm_c, gaussian, train_labels)
    machine.set_epsilon(eps)
    machine.train()

    # Re-initialise the kernel on (train, test) before classifying.
    gaussian.init(train_feats, test_feats)
    machine.classify().get_labels()

    # Query the trained model.
    sv_idx = machine.get_support_vectors()
    alphas = machine.get_alphas()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    label_train_twoclass = lm.load_labels('../data/label_train_twoclass.dat')
    libsvm()
def libsvm_multiclass():
    """Train a LibSVMMultiClass with a Gaussian kernel and classify the test set.

    Uses the module-level ``fm_train_real``, ``fm_test_real`` and
    ``label_train_multiclass`` loaded by the ``__main__`` guard below.
    The predicted labels are computed and discarded.
    """
    print('LibSVMMultiClass')
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import LibSVMMultiClass

    kernel_width = 2.1
    svm_c = 1
    eps = 1e-5

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)
    train_labels = Labels(label_train_multiclass)
    gaussian = GaussianKernel(train_feats, train_feats, kernel_width)

    machine = LibSVMMultiClass(svm_c, gaussian, train_labels)
    machine.set_epsilon(eps)
    machine.train()

    # Re-initialise the kernel on (train, test) before classifying.
    gaussian.init(train_feats, test_feats)
    machine.classify().get_labels()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    label_train_multiclass = lm.load_labels('../data/label_train_multiclass.dat')
    libsvm_multiclass()
def libsvm_oneclass():
    """Train a LibSVMOneClass with a Gaussian kernel and classify the test set.

    Uses the module-level ``fm_train_real`` and ``fm_test_real`` loaded by
    the ``__main__`` guard below; no labels are passed to the machine.
    The predicted labels are computed and discarded.
    """
    print('LibSVMOneClass')
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import LibSVMOneClass

    kernel_width = 2.1
    svm_c = 1
    eps = 1e-5

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)
    gaussian = GaussianKernel(train_feats, train_feats, kernel_width)

    machine = LibSVMOneClass(svm_c, gaussian)
    machine.set_epsilon(eps)
    machine.train()

    # Re-initialise the kernel on (train, test) before classifying.
    gaussian.init(train_feats, test_feats)
    machine.classify().get_labels()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    libsvm_oneclass()
def mpdsvm():
    """Train an MPDSVM with a Gaussian kernel and classify the test set.

    Uses the module-level ``fm_train_real``, ``fm_test_real`` and
    ``label_train_twoclass`` loaded by the ``__main__`` guard below.
    The predicted labels are computed and discarded.
    """
    print('MPDSVM')
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import MPDSVM

    kernel_width = 2.1
    svm_c = 1
    eps = 1e-5

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)
    train_labels = Labels(label_train_twoclass)
    gaussian = GaussianKernel(train_feats, train_feats, kernel_width)

    machine = MPDSVM(svm_c, gaussian, train_labels)
    machine.set_epsilon(eps)
    machine.train()

    # Re-initialise the kernel on (train, test) before classifying.
    gaussian.init(train_feats, test_feats)
    machine.classify().get_labels()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    label_train_twoclass = lm.load_labels('../data/label_train_twoclass.dat')
    mpdsvm()
def perceptron():
    """Train a Perceptron on the training data and classify the test set.

    Uses the module-level ``fm_train_real``, ``fm_test_real`` and
    ``label_train_twoclass`` loaded by the ``__main__`` guard below.
    The predicted labels are computed and discarded.
    """
    print('Perceptron')
    from shogun.Features import RealFeatures, Labels
    from shogun.Classifier import Perceptron

    rate = 1.
    iteration_cap = 1000
    num_threads = 1  # set by the example but not used below

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)
    train_labels = Labels(label_train_twoclass)

    model = Perceptron(train_feats, train_labels)
    model.set_learn_rate(rate)
    model.set_max_iter(iteration_cap)
    # only guaranteed to converge for separable data
    model.train()

    # Swap in the test features before classifying.
    model.set_features(test_feats)
    model.classify().get_labels()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    label_train_twoclass = lm.load_labels('../data/label_train_twoclass.dat')
    perceptron()
def subgradient_svm():
    """Train a SubGradientSVM on sparse features and classify the test set.

    Uses the module-level ``fm_train_real``, ``fm_test_real`` and
    ``label_train_twoclass`` loaded by the ``__main__`` guard below.
    The predicted labels are computed and discarded.
    """
    print('SubGradientSVM')
    from shogun.Features import RealFeatures, SparseRealFeatures, Labels
    from shogun.Classifier import SubGradientSVM

    svm_c = 0.9
    eps = 1e-3
    thread_count = 1
    train_time_limit = 1.

    # Dense -> sparse conversion for the training data ...
    dense_train = RealFeatures(fm_train_real)
    sparse_train = SparseRealFeatures()
    sparse_train.obtain_from_simple(dense_train)

    # ... and for the test data.
    dense_test = RealFeatures(fm_test_real)
    sparse_test = SparseRealFeatures()
    sparse_test.obtain_from_simple(dense_test)

    train_labels = Labels(label_train_twoclass)

    machine = SubGradientSVM(svm_c, sparse_train, train_labels)
    machine.set_epsilon(eps)
    machine.parallel.set_num_threads(thread_count)
    machine.set_bias_enabled(False)
    machine.set_max_train_time(train_time_limit)
    machine.train()

    # Swap in the test features before classifying.
    machine.set_features(sparse_test)
    machine.classify().get_labels()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    label_train_twoclass = lm.load_labels('../data/label_train_twoclass.dat')
    subgradient_svm()
def do_batch_linadd():
    """Train an SVMLight on DNA strings and classify with and without batch mode.

    Uses the module-level ``fm_train_dna``, ``fm_test_dna`` and
    ``label_train_dna`` loaded by the ``__main__`` guard below. Returns early
    (after printing a notice) when SVMLight cannot be imported.
    """
    print('SVMlight batch')
    from shogun.Features import StringCharFeatures, Labels, DNA
    from shogun.Kernel import WeightedDegreeStringKernel
    try:
        from shogun.Classifier import SVMLight
    except ImportError:
        print('No support for SVMLight available.')
        return

    wd_degree = 20
    svm_c = 1
    eps = 1e-5
    thread_count = 2

    train_feats = StringCharFeatures(DNA)
    train_feats.set_features(fm_train_dna)
    test_feats = StringCharFeatures(DNA)
    test_feats.set_features(fm_test_dna)

    wd_kernel = WeightedDegreeStringKernel(train_feats, train_feats, wd_degree)
    train_labels = Labels(label_train_dna)

    machine = SVMLight(svm_c, wd_kernel, train_labels)
    machine.set_epsilon(eps)
    machine.parallel.set_num_threads(thread_count)
    machine.train()

    # Re-initialise the kernel on (train, test) before classifying.
    wd_kernel.init(train_feats, test_feats)

    # Disabled diagnostic output from the example:
    #print 'SVMLight Objective: %f num_sv: %d' % \
    #    (machine.get_objective(), machine.get_num_support_vectors())

    # First pass: batch computation and linadd both switched off.
    machine.set_batch_computation_enabled(False)
    machine.set_linadd_enabled(False)
    machine.classify().get_labels()

    # Second pass: batch computation switched on.
    machine.set_batch_computation_enabled(True)
    machine.classify().get_labels()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_dna = lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna = lm.load_dna('../data/fm_test_dna.dat')
    label_train_dna = lm.load_labels('../data/label_train_dna.dat')
    do_batch_linadd()
# Example: SVMLight on DNA strings with a custom linear term.
import numpy

degree = 3
fm_train_dna = ['CGCACGTACGTAGCTCGAT',
                'CGACGTAGTCGTAGTCGTA',
                'CGACGGGGGGGGGGTCGTA',
                'CGACCTAGTCGTAGTCGTA',
                'CGACCACAGTTATATAGTA',
                'CGACGTAGTCGTAGTCGTA',
                'CGACGTAGTTTTTTTCGTA',
                'CGACGTAGTCGTAGCCCCA',
                'CAAAAAAAAAAAAAAAATA',
                'CGACGGGGGGGGGGGCGTA']
label_train_dna = numpy.array(5*[-1.0] + 5*[1.0])
fm_test_dna = ['AGCACGTACGTAGCTCGAT',
               'AGACGTAGTCGTAGTCGTA',
               'CAACGGGGGGGGGGTCGTA',
               'CGACCTAGTCGTAGTCGTA',
               'CGAACACAGTTATATAGTA',
               'CGACCTAGTCGTAGTCGTA',
               'CGACGTGGGGTTTTTCGTA',
               'CGACGTAGTCCCAGCCCCA',
               'CAAAAAAAAAAAACCAATA',
               'CGACGGCCGGGGGGGCGTA']
label_test_dna = numpy.array(5*[-1.0] + 5*[1.0])

print('SVMLight')
from shogun.Features import StringCharFeatures, Labels, DNA
from shogun.Kernel import WeightedDegreeStringKernel
from shogun.Classifier import SVMLight

feats_train = StringCharFeatures(DNA)
feats_train.set_features(fm_train_dna)
feats_test = StringCharFeatures(DNA)
feats_test.set_features(fm_test_dna)

kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)

C = 10
epsilon = 1e-5
num_threads = 1
labels = Labels(label_train_dna)

svm = SVMLight(C, kernel, labels)
svm.set_qpsize(3)
# One linear-term entry per training string (ten values for ten strings).
svm.set_linear_term(
    -numpy.array([1, 2, 3, 4, 5, 6, 7, 8, 7, 6], dtype=numpy.double))
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.train()

# Re-initialise the kernel on (train, test) before classifying.
kernel.init(feats_train, feats_test)
out = svm.classify().get_labels()
def svm_light():
    """Train an SVMLight on DNA strings and classify the test strings.

    Uses the module-level ``fm_train_dna``, ``fm_test_dna`` and
    ``label_train_dna`` loaded by the ``__main__`` guard below. Returns early
    (after printing a notice) when SVMLight cannot be imported.
    """
    print('SVMLight')
    from shogun.Features import StringCharFeatures, Labels, DNA
    from shogun.Kernel import WeightedDegreeStringKernel
    try:
        from shogun.Classifier import SVMLight
    except ImportError:
        print('No support for SVMLight available.')
        return

    wd_degree = 20
    svm_c = 1.2
    eps = 1e-5
    thread_count = 1

    train_feats = StringCharFeatures(DNA)
    train_feats.set_features(fm_train_dna)
    test_feats = StringCharFeatures(DNA)
    test_feats.set_features(fm_test_dna)

    wd_kernel = WeightedDegreeStringKernel(train_feats, train_feats, wd_degree)
    train_labels = Labels(label_train_dna)

    machine = SVMLight(svm_c, wd_kernel, train_labels)
    machine.set_epsilon(eps)
    machine.parallel.set_num_threads(thread_count)
    machine.train()

    # Re-initialise the kernel on (train, test) before classifying.
    wd_kernel.init(train_feats, test_feats)
    machine.classify().get_labels()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_dna = lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna = lm.load_dna('../data/fm_test_dna.dat')
    label_train_dna = lm.load_labels('../data/label_train_dna.dat')
    svm_light()
def svmlin():
    """Train an SVMLin on sparse features, query the model and classify.

    Uses the module-level ``fm_train_real``, ``fm_test_real`` and
    ``label_train_twoclass`` loaded by the ``__main__`` guard below.
    Bias, weights and predictions are queried but not returned.
    """
    print('SVMLin')
    from shogun.Features import RealFeatures, SparseRealFeatures, Labels
    from shogun.Classifier import SVMLin

    svm_c = 0.9
    eps = 1e-5
    thread_count = 1

    # Dense -> sparse conversion for the training data ...
    dense_train = RealFeatures(fm_train_real)
    sparse_train = SparseRealFeatures()
    sparse_train.obtain_from_simple(dense_train)

    # ... and for the test data.
    dense_test = RealFeatures(fm_test_real)
    sparse_test = SparseRealFeatures()
    sparse_test.obtain_from_simple(dense_test)

    train_labels = Labels(label_train_twoclass)

    machine = SVMLin(svm_c, sparse_train, train_labels)
    machine.set_epsilon(eps)
    machine.parallel.set_num_threads(thread_count)
    machine.set_bias_enabled(True)
    machine.train()

    # Swap in the test features, then query the model and classify.
    machine.set_features(sparse_test)
    machine.get_bias()
    machine.get_w()
    machine.classify().get_labels()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    label_train_twoclass = lm.load_labels('../data/label_train_twoclass.dat')
    svmlin()
def svmocas():
    """Train an SVMOcas on sparse features and classify the test set.

    Uses the module-level ``fm_train_real``, ``fm_test_real`` and
    ``label_train_twoclass`` loaded by the ``__main__`` guard below.
    The predicted labels are computed and discarded.
    """
    print('SVMOcas')
    from shogun.Features import RealFeatures, SparseRealFeatures, Labels
    from shogun.Classifier import SVMOcas

    svm_c = 0.9
    eps = 1e-5
    thread_count = 1

    # Dense -> sparse conversion for the training data ...
    dense_train = RealFeatures(fm_train_real)
    sparse_train = SparseRealFeatures()
    sparse_train.obtain_from_simple(dense_train)

    # ... and for the test data.
    dense_test = RealFeatures(fm_test_real)
    sparse_test = SparseRealFeatures()
    sparse_test.obtain_from_simple(dense_test)

    train_labels = Labels(label_train_twoclass)

    machine = SVMOcas(svm_c, sparse_train, train_labels)
    machine.set_epsilon(eps)
    machine.parallel.set_num_threads(thread_count)
    machine.set_bias_enabled(False)
    machine.train()

    # Swap in the test features before classifying.
    machine.set_features(sparse_test)
    machine.classify().get_labels()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    label_train_twoclass = lm.load_labels('../data/label_train_twoclass.dat')
    svmocas()
def svmsgd():
    """Train an SVMSGD on sparse features and classify the test set.

    Uses the module-level ``fm_train_real``, ``fm_test_real`` and
    ``label_train_twoclass`` loaded by the ``__main__`` guard below.
    The predicted labels are computed and discarded.
    """
    print('SVMSGD')
    from shogun.Features import RealFeatures, SparseRealFeatures, Labels
    from shogun.Classifier import SVMSGD

    svm_c = 0.9
    num_threads = 1  # set by the example but not used below
    epochs = 5

    # Dense -> sparse conversion for the training data ...
    dense_train = RealFeatures(fm_train_real)
    sparse_train = SparseRealFeatures()
    sparse_train.obtain_from_simple(dense_train)

    # ... and for the test data.
    dense_test = RealFeatures(fm_test_real)
    sparse_test = SparseRealFeatures()
    sparse_test.obtain_from_simple(dense_test)

    train_labels = Labels(label_train_twoclass)

    machine = SVMSGD(svm_c, sparse_train, train_labels)
    machine.set_epochs(epochs)
    #machine.io.set_loglevel(0)
    machine.train()

    # Swap in the test features before classifying.
    machine.set_features(sparse_test)
    machine.classify().get_labels()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    label_train_twoclass = lm.load_labels('../data/label_train_twoclass.dat')
    svmsgd()
def hierarchical():
    """Run Hierarchical clustering on a Euclidian distance and query the result.

    Uses the module-level ``fm_train`` loaded by the ``__main__`` guard
    below. Merge distances and cluster pairs are queried but not returned.
    """
    print('Hierarchical')
    from shogun.Distance import EuclidianDistance
    from shogun.Features import RealFeatures
    from shogun.Clustering import Hierarchical

    num_merges = 3

    train_feats = RealFeatures(fm_train)
    metric = EuclidianDistance(train_feats, train_feats)

    clustering = Hierarchical(num_merges, metric)
    clustering.train()

    clustering.get_merge_distances()
    clustering.get_cluster_pairs()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train = lm.load_numbers('../data/fm_train_real.dat')
    hierarchical()
#!/usr/bin/env python
"""
Explicit examples on how to use clustering
"""
def kmeans():
    """Run KMeans clustering on a Euclidian distance and query the result.

    Uses the module-level ``fm_train`` loaded by the ``__main__`` guard
    below. Cluster centers and radiuses are queried but not returned.
    """
    print('KMeans')
    from shogun.Distance import EuclidianDistance
    from shogun.Features import RealFeatures
    from shogun.Clustering import KMeans

    num_clusters = 3

    train_feats = RealFeatures(fm_train)
    metric = EuclidianDistance(train_feats, train_feats)

    clustering = KMeans(num_clusters, metric)
    clustering.train()

    clustering.get_cluster_centers()
    clustering.get_radiuses()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train = lm.load_numbers('../data/fm_train_real.dat')
    kmeans()
def bray_curtis_distance():
    """Compute BrayCurtisDistance matrices for train/train and train/test.

    Uses the module-level ``fm_train_real`` and ``fm_test_real`` loaded by
    the ``__main__`` guard below. Both matrices are computed but not returned.
    """
    print('BrayCurtisDistance')
    from shogun.Features import RealFeatures
    from shogun.Distance import BrayCurtisDistance

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)

    metric = BrayCurtisDistance(train_feats, train_feats)
    train_matrix = metric.get_distance_matrix()

    # Re-initialise on (train, test) to get the cross distance matrix.
    metric.init(train_feats, test_feats)
    test_matrix = metric.get_distance_matrix()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    bray_curtis_distance()
def canberra_metric():
    """Compute CanberraMetric matrices for train/train and train/test.

    Uses the module-level ``fm_train_real`` and ``fm_test_real`` loaded by
    the ``__main__`` guard below. Both matrices are computed but not returned.
    """
    # The banner previously read 'CanberaMetric', misspelling the class name.
    print('CanberraMetric')
    from shogun.Features import RealFeatures
    from shogun.Distance import CanberraMetric

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    distance = CanberraMetric(feats_train, feats_train)
    dm_train = distance.get_distance_matrix()

    # Re-initialise on (train, test) to get the cross distance matrix.
    distance.init(feats_train, feats_test)
    dm_test = distance.get_distance_matrix()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    canberra_metric()
def canberra_word_distance():
    """Compute CanberraWordDistance matrices on sorted word features.

    Uses the module-level ``fm_train_dna`` and ``fm_test_dna`` loaded by the
    ``__main__`` guard below. Both matrices are computed but not returned.
    """
    print('CanberraWordDistance')
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.PreProc import SortWordString
    from shogun.Distance import CanberraWordDistance

    order = 3
    gap = 0
    reverse = False

    def _as_word_features(dna_strings):
        # Wrap the strings as DNA char features and derive word features.
        chars = StringCharFeatures(DNA)
        chars.set_features(dna_strings)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order-1, order, gap, reverse)
        return words

    # The sorting preprocessor is initialised on the training features and
    # then applied to both sets.
    train_words = _as_word_features(fm_train_dna)
    sorter = SortWordString()
    sorter.init(train_words)
    train_words.add_preproc(sorter)
    train_words.apply_preproc()

    test_words = _as_word_features(fm_test_dna)
    test_words.add_preproc(sorter)
    test_words.apply_preproc()

    metric = CanberraWordDistance(train_words, train_words)
    train_matrix = metric.get_distance_matrix()

    # Re-initialise on (train, test) to get the cross distance matrix.
    metric.init(train_words, test_words)
    test_matrix = metric.get_distance_matrix()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_dna = lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna = lm.load_dna('../data/fm_test_dna.dat')
    # Loaded by the example although the function above does not read it.
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    canberra_word_distance()
def chebyshew_metric():
    """Compute ChebyshewMetric matrices for train/train and train/test.

    Uses the module-level ``fm_train_real`` and ``fm_test_real`` loaded by
    the ``__main__`` guard below. Both matrices are computed but not returned.
    """
    print('ChebyshewMetric')
    from shogun.Features import RealFeatures
    from shogun.Distance import ChebyshewMetric

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)

    metric = ChebyshewMetric(train_feats, train_feats)
    train_matrix = metric.get_distance_matrix()

    # Re-initialise on (train, test) to get the cross distance matrix.
    metric.init(train_feats, test_feats)
    test_matrix = metric.get_distance_matrix()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    chebyshew_metric()
def chi_square_distance():
    """Compute ChiSquareDistance matrices for train/train and train/test.

    Uses the module-level ``fm_train_real`` and ``fm_test_real`` loaded by
    the ``__main__`` guard below. Both matrices are computed but not returned.
    """
    print('ChiSquareDistance')
    from shogun.Features import RealFeatures
    from shogun.Distance import ChiSquareDistance

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)

    metric = ChiSquareDistance(train_feats, train_feats)
    train_matrix = metric.get_distance_matrix()

    # Re-initialise on (train, test) to get the cross distance matrix.
    metric.init(train_feats, test_feats)
    test_matrix = metric.get_distance_matrix()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    chi_square_distance()
def cosine_distance():
    """Compute CosineDistance matrices for train/train and train/test.

    Uses the module-level ``fm_train_real`` and ``fm_test_real`` loaded by
    the ``__main__`` guard below. Both matrices are computed but not returned.
    """
    print('CosineDistance')
    from shogun.Features import RealFeatures
    from shogun.Distance import CosineDistance

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)

    metric = CosineDistance(train_feats, train_feats)
    train_matrix = metric.get_distance_matrix()

    # Re-initialise on (train, test) to get the cross distance matrix.
    metric.init(train_feats, test_feats)
    test_matrix = metric.get_distance_matrix()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    cosine_distance()
def euclidian_distance():
    """Compute EuclidianDistance matrices for train/train and train/test.

    Uses the module-level ``fm_train_real`` and ``fm_test_real`` loaded by
    the ``__main__`` guard below. Both matrices are computed but not returned.
    """
    print('EuclidianDistance')
    from shogun.Features import RealFeatures
    from shogun.Distance import EuclidianDistance

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)

    metric = EuclidianDistance(train_feats, train_feats)
    train_matrix = metric.get_distance_matrix()

    # Re-initialise on (train, test) to get the cross distance matrix.
    metric.init(train_feats, test_feats)
    test_matrix = metric.get_distance_matrix()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    euclidian_distance()
def geodesic_metric():
    """Compute GeodesicMetric matrices for train/train and train/test.

    Uses the module-level ``fm_train_real`` and ``fm_test_real`` loaded by
    the ``__main__`` guard below. Both matrices are computed but not returned.
    """
    print('GeodesicMetric')
    from shogun.Features import RealFeatures
    from shogun.Distance import GeodesicMetric

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)

    metric = GeodesicMetric(train_feats, train_feats)
    train_matrix = metric.get_distance_matrix()

    # Re-initialise on (train, test) to get the cross distance matrix.
    metric.init(train_feats, test_feats)
    test_matrix = metric.get_distance_matrix()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    geodesic_metric()
def hamming_word_distance():
    """Compute HammingWordDistance matrices on sorted word features.

    Uses the module-level ``fm_train_dna`` and ``fm_test_dna`` loaded by the
    ``__main__`` guard below. Both matrices are computed but not returned.
    """
    print('HammingWordDistance')
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.PreProc import SortWordString
    from shogun.Distance import HammingWordDistance

    order = 3
    gap = 0
    reverse = False
    use_sign = False

    def _as_word_features(dna_strings):
        # Wrap the strings as DNA char features and derive word features.
        chars = StringCharFeatures(DNA)
        chars.set_features(dna_strings)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order-1, order, gap, reverse)
        return words

    # The sorting preprocessor is initialised on the training features and
    # then applied to both sets.
    train_words = _as_word_features(fm_train_dna)
    sorter = SortWordString()
    sorter.init(train_words)
    train_words.add_preproc(sorter)
    train_words.apply_preproc()

    test_words = _as_word_features(fm_test_dna)
    test_words.add_preproc(sorter)
    test_words.apply_preproc()

    metric = HammingWordDistance(train_words, train_words, use_sign)
    train_matrix = metric.get_distance_matrix()

    # Re-initialise on (train, test) to get the cross distance matrix.
    metric.init(train_words, test_words)
    test_matrix = metric.get_distance_matrix()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_dna = lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna = lm.load_dna('../data/fm_test_dna.dat')
    # Loaded by the example although the function above does not read it.
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    hamming_word_distance()
def jensen_metric():
    """Compute JensenMetric matrices for train/train and train/test.

    Uses the module-level ``fm_train_real`` and ``fm_test_real`` loaded by
    the ``__main__`` guard below. Both matrices are computed but not returned.
    """
    print('JensenMetric')
    from shogun.Features import RealFeatures
    from shogun.Distance import JensenMetric

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)

    metric = JensenMetric(train_feats, train_feats)
    train_matrix = metric.get_distance_matrix()

    # Re-initialise on (train, test) to get the cross distance matrix.
    metric.init(train_feats, test_feats)
    test_matrix = metric.get_distance_matrix()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    jensen_metric()
def manhattan_metric():
    """Compute ManhattanMetric matrices for train/train and train/test.

    Uses the module-level ``fm_train_real`` and ``fm_test_real`` loaded by
    the ``__main__`` guard below. Both matrices are computed but not returned.
    """
    print('ManhattanMetric')
    from shogun.Features import RealFeatures
    from shogun.Distance import ManhattanMetric

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)

    metric = ManhattanMetric(train_feats, train_feats)
    train_matrix = metric.get_distance_matrix()

    # Re-initialise on (train, test) to get the cross distance matrix.
    metric.init(train_feats, test_feats)
    test_matrix = metric.get_distance_matrix()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    manhattan_metric()
def manhattan_word_distance():
    """Compute ManhattanWordDistance matrices on sorted word features.

    Uses the module-level ``fm_train_dna`` and ``fm_test_dna`` loaded by the
    ``__main__`` guard below. Both matrices are computed but not returned.
    """
    print('ManhattanWordDistance')
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.PreProc import SortWordString
    from shogun.Distance import ManhattanWordDistance

    order = 3
    gap = 0
    reverse = False

    def _as_word_features(dna_strings):
        # Wrap the strings as DNA char features and derive word features.
        chars = StringCharFeatures(DNA)
        chars.set_features(dna_strings)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order-1, order, gap, reverse)
        return words

    # The sorting preprocessor is initialised on the training features and
    # then applied to both sets.
    train_words = _as_word_features(fm_train_dna)
    sorter = SortWordString()
    sorter.init(train_words)
    train_words.add_preproc(sorter)
    train_words.apply_preproc()

    test_words = _as_word_features(fm_test_dna)
    test_words.add_preproc(sorter)
    test_words.apply_preproc()

    metric = ManhattanWordDistance(train_words, train_words)
    train_matrix = metric.get_distance_matrix()

    # Re-initialise on (train, test) to get the cross distance matrix.
    metric.init(train_words, test_words)
    test_matrix = metric.get_distance_matrix()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_dna = lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna = lm.load_dna('../data/fm_test_dna.dat')
    # Loaded by the example although the function above does not read it.
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    manhattan_word_distance()
def minkowski_metric():
    """Compute MinkowskiMetric (k=3) matrices for train/train and train/test.

    Uses the module-level ``fm_train_real`` and ``fm_test_real`` loaded by
    the ``__main__`` guard below. Both matrices are computed but not returned.
    """
    print('MinkowskiMetric')
    from shogun.Features import RealFeatures
    from shogun.Distance import MinkowskiMetric

    minkowski_k = 3

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)

    metric = MinkowskiMetric(train_feats, train_feats, minkowski_k)
    train_matrix = metric.get_distance_matrix()

    # Re-initialise on (train, test) to get the cross distance matrix.
    metric.init(train_feats, test_feats)
    test_matrix = metric.get_distance_matrix()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    minkowski_metric()
def norm_squared_distance():
    """Compute EuclidianDistance matrices with the square root disabled.

    Uses the module-level ``fm_train_real`` and ``fm_test_real`` loaded by
    the ``__main__`` guard below. Both matrices are computed but not returned.
    """
    from shogun.Features import RealFeatures
    from shogun.Distance import EuclidianDistance
    print('EuclidianDistance - NormSquared')

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)

    metric = EuclidianDistance(train_feats, train_feats)
    # Switch off the sqrt before any matrix is requested.
    metric.set_disable_sqrt(True)
    train_matrix = metric.get_distance_matrix()

    # Re-initialise on (train, test) to get the cross distance matrix.
    metric.init(train_feats, test_feats)
    test_matrix = metric.get_distance_matrix()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    norm_squared_distance()
def sparse_euclidian_distance():
    """Compute SparseEuclidianDistance matrices on sparse features.

    Uses the module-level ``fm_train_real`` and ``fm_test_real`` loaded by
    the ``__main__`` guard below. Both matrices are computed but not returned.
    """
    print('SparseEuclidianDistance')
    from shogun.Features import RealFeatures, SparseRealFeatures
    from shogun.Distance import SparseEuclidianDistance

    # Dense -> sparse conversion for the training data ...
    dense_train = RealFeatures(fm_train_real)
    sparse_train = SparseRealFeatures()
    sparse_train.obtain_from_simple(dense_train)

    # ... and for the test data.
    dense_test = RealFeatures(fm_test_real)
    sparse_test = SparseRealFeatures()
    sparse_test.obtain_from_simple(dense_test)

    metric = SparseEuclidianDistance(sparse_train, sparse_train)
    train_matrix = metric.get_distance_matrix()

    # Re-initialise on (train, test) to get the cross distance matrix.
    metric.init(sparse_train, sparse_test)
    test_matrix = metric.get_distance_matrix()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    sparse_euclidian_distance()
def tanimoto_distance():
    """Compute TanimotoDistance matrices for train/train and train/test.

    Uses the module-level ``fm_train_real`` and ``fm_test_real`` loaded by
    the ``__main__`` guard below. Both matrices are computed but not returned.
    """
    print('TanimotoDistance')
    from shogun.Features import RealFeatures
    from shogun.Distance import TanimotoDistance

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)

    metric = TanimotoDistance(train_feats, train_feats)
    train_matrix = metric.get_distance_matrix()

    # Re-initialise on (train, test) to get the cross distance matrix.
    metric.init(train_feats, test_feats)
    test_matrix = metric.get_distance_matrix()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    tanimoto_distance()
def histogram():
    """Train a Histogram distribution on word features and query it.

    Uses the module-level ``fm_dna`` loaded by the ``__main__`` guard below.
    The histogram and log-likelihoods are queried but not returned.
    """
    print('Histogram')
    from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
    from shogun.Distribution import Histogram

    order = 3
    gap = 0
    reverse = False

    # DNA char features -> word features.
    chars = StringCharFeatures(DNA)
    chars.set_features(fm_dna)
    words = StringWordFeatures(chars.get_alphabet())
    words.obtain_from_char(chars, order-1, order, gap, reverse)

    model = Histogram(words)
    model.train()
    model.get_histogram()

    num_examples = words.get_num_vectors()
    num_param = model.get_num_model_parameters()
    # Per-parameter log derivatives are left disabled in this example:
    #for i in xrange(num_examples):
    #    for j in xrange(num_param):
    #        model.get_log_derivative(j, i)

    model.get_log_likelihood()
    model.get_log_likelihood_sample()


###########################################################################
# call functions
###########################################################################

if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_dna = lm.load_dna('../data/fm_train_dna.dat')
    histogram()
def hmm ():
print 'HMM'
from shogun.Features import StringWordFeatures, StringCharFeatures, CUBE
from shogun.Distribution import HMM, BW_NORMAL
N=3
M=6
pseudo=1e-1
order=1
gap=0
reverse=False
num_examples=2
charfeat=StringCharFeatures(CUBE)
charfeat.set_features(fm_cube)
feats=StringWordFeatures(charfeat.get_alphabet())
feats.obtain_from_char(charfeat, order-1, order, gap, reverse)
hmm=HMM(feats, N, M, pseudo)
hmm.train()
hmm.baum_welch_viterbi_train(BW_NORMAL)
num_examples=feats.get_num_vectors()
num_param=hmm.get_num_model_parameters()
for i in xrange(num_examples):
for j in xrange(num_param):
hmm.get_log_derivative(j, i)
best_path=0
best_path_state=0
for i in xrange(num_examples):
best_path+=hmm.best_path(i)
for j in xrange(N):
best_path_state+=hmm.get_best_path_state(i, j)
hmm.get_log_likelihood()
hmm.get_log_likelihood_sample()
###########################################################################
# call functions
###########################################################################
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_cube=lm.load_cubes('../data/fm_train_cube.dat')
hmm()
def linear_hmm ():
print 'LinearHMM'
from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
from shogun.Distribution import LinearHMM
order=3
gap=0
reverse=False
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_dna)
feats=StringWordFeatures(charfeat.get_alphabet())
feats.obtain_from_char(charfeat, order-1, order, gap, reverse)
hmm=LinearHMM(feats)
hmm.train()
hmm.get_transition_probs()
num_examples=feats.get_num_vectors()
num_param=hmm.get_num_model_parameters()
for i in xrange(num_examples):
for j in xrange(num_param):
hmm.get_log_derivative(j, i)
hmm.get_log_likelihood()
hmm.get_log_likelihood_sample()
###########################################################################
# call functions
###########################################################################
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_dna=lm.load_dna('../data/fm_train_dna.dat')
linear_hmm()
def io ():
print 'Features IO'
import numpy
from shogun.Features import SparseRealFeatures, RealFeatures, Labels
from shogun.Kernel import GaussianKernel
from shogun.Library import AsciiFile, BinaryFile, HDF5File
feats=SparseRealFeatures(fm_train_real)
feats2=SparseRealFeatures()
f=BinaryFile("fm_train_sparsereal.bin","w")
feats.save(f)
f=AsciiFile("fm_train_sparsereal.ascii","w")
feats.save(f)
f=BinaryFile("fm_train_sparsereal.bin")
feats2.load(f)
f=AsciiFile("fm_train_sparsereal.ascii")
feats2.load(f)
feats=RealFeatures(fm_train_real)
feats2=RealFeatures()
f=BinaryFile("fm_train_real.bin","w")
feats.save(f)
f=HDF5File("fm_train_real.h5","w", "/data/doubles")
feats.save(f)
f=AsciiFile("fm_train_real.ascii","w")
feats.save(f)
f=BinaryFile("fm_train_real.bin")
feats2.load(f)
print "diff binary", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten()))
f=AsciiFile("fm_train_real.ascii")
feats2.load(f)
print "diff ascii", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten()))
lab=Labels(numpy.array([1.0,2.0,3.0]))
lab2=Labels()
f=AsciiFile("label_train_twoclass.ascii","w")
lab.save(f)
f=BinaryFile("label_train_twoclass.bin","w")
lab.save(f)
f=HDF5File("fm_train_real.h5","a", "/data/labels")
lab.save(f)
f=AsciiFile("label_train_twoclass.ascii")
lab2.load(f)
f=BinaryFile("label_train_twoclass.bin")
lab2.load(f)
f=HDF5File("fm_train_real.h5","r", "/data/doubles")
feats2.load(f)
print feats2.get_feature_matrix()
f=HDF5File("fm_train_real.h5","r", "/data/labels")
lab2.load(f)
print lab2.get_labels()
#clean up
import os
for f in ['fm_train_sparsereal.bin','fm_train_sparsereal.ascii',
'fm_train_real.bin','fm_train_real.h5','fm_train_real.ascii',
'label_train_twoclass.ascii','label_train_twoclass.bin']:
os.unlink(f)
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
label_train_twoclass=lm.load_numbers('../data/label_train_twoclass.dat')
io()
# Load a file in SVMlight format into sparse features, write it back out
# under a temporary name and remove the written copy again.
import os
from shogun.Features import SparseRealFeatures

SCRATCH_FILE = 'testwrite.light'

sparse_feats = SparseRealFeatures()
# load_svmlight_file returns the labels stored alongside the features
svmlight_labels = sparse_feats.load_svmlight_file('../data/train_sparsereal.light')
sparse_feats.write_svmlight_file(SCRATCH_FILE, svmlight_labels)
os.unlink(SCRATCH_FILE)
from shogun.Features import ByteFeatures from numpy import array, uint8, all # create dense matrix A A=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=uint8) # ... of type Byte a=ByteFeatures(A) # print some statistics about a print a.get_num_vectors() print a.get_num_features() # get first feature vector and set it print a.get_feature_vector(0) a.set_feature_vector(array([1,4,0,0,0,9], dtype=uint8), 0) # get matrix a_out = a.get_feature_matrix() print type(a_out), a_out.dtype print a_out assert(all(a_out==A))
from shogun.Features import LongIntFeatures from numpy import array, int64, all # create dense matrix A A=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=int64) # ... of type LongInt a=LongIntFeatures(A) # print some statistics about a print a.get_num_vectors() print a.get_num_features() # get first feature vector and set it print a.get_feature_vector(0) a.set_feature_vector(array([1,4,0,0,0,9], dtype=int64), 0) # get matrix a_out = a.get_feature_matrix() print type(a_out), a_out.dtype print a_out assert(all(a_out==A))
from shogun.Features import RealFeatures, LongIntFeatures, ByteFeatures from numpy import array, float64, int64, uint8, all # create dense matrices A,B,C A=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=float64) B=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=int64) C=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=uint8) # ... of type Real, LongInt and Byte a=RealFeatures(A) b=LongIntFeatures(B) c=ByteFeatures(C) # or 16bit wide ... #feat1 = f.ShortFeatures(N.zeros((10,5),N.short)) #feat2 = f.WordFeatures(N.zeros((10,5),N.uint16)) # print some statistics about a print a.get_num_vectors() print a.get_num_features() # get first feature vector and set it print a.get_feature_vector(0) a.set_feature_vector(array([1,4,0,0,0,9], dtype=float64), 0) # get matrices a_out = a.get_feature_matrix() b_out = b.get_feature_matrix() c_out = c.get_feature_matrix() print type(a_out), a_out.dtype print a_out assert(all(a_out==A)) print type(b_out), b_out.dtype print b_out assert(all(b_out==B)) print type(c_out), c_out.dtype print c_out assert(all(c_out==C))
from shogun.Features import RealFeatures from numpy import array, float64, all # create dense matrices A,B,C A=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=float64) # ... of type Real, LongInt and Byte a=RealFeatures(A) # print some statistics about a print a.get_num_vectors() print a.get_num_features() # get first feature vector and set it print a.get_feature_vector(0) a.set_feature_vector(array([1,4,0,0,0,9], dtype=float64), 0) # get matrix a_out = a.get_feature_matrix() print type(a_out), a_out.dtype print a_out assert(all(a_out==A))
from shogun.Features import *
from numpy import *
sf=StringByteFeatures(DIGIT2)
sf.load_ascii_file('x', False, DIGIT2, DIGIT2)
print sf.get_features()
snps=SNPFeatures(sf)
print snps.get_feature_matrix()
print snps.get_minor_base_string()
print snps.get_major_base_string()
from scipy.sparse import csc_matrix from shogun.Features import SparseRealFeatures from numpy import array, float64, all # create dense matrix A and its sparse representation X # note, will work with types other than float64 too, # but requires recent scipy.sparse A=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=float64) X=csc_matrix(A) print A # create sparse shogun features from dense matrix A a=SparseRealFeatures(A) a_out=a.get_full_feature_matrix() print a_out assert(all(a_out==A)) print a_out # create sparse shogun features from sparse matrix X a.set_sparse_feature_matrix(X) a_out=a.get_full_feature_matrix() print a_out assert(all(a_out==A)) # create sparse shogun features from sparse matrix X a=SparseRealFeatures(X) a_out=a.get_full_feature_matrix() print a_out assert(all(a_out==A)) # obtain (data,row,indptr) csc arrays of sparse shogun features z=csc_matrix(a.get_sparse_feature_matrix()) z_out=z.todense() print z_out assert(all(z_out==A))
from shogun.Features import StringCharFeatures, StringFileCharFeatures, RAWBYTE
from shogun.Library import UNCOMPRESSED,LZO,GZIP,BZIP2,LZMA, MSG_DEBUG
from shogun.PreProc import DecompressCharString
f=StringFileCharFeatures('features_string_char_compressed_modular.py', RAWBYTE)
print "original strings", f.get_features()
#uncompressed
f.save_compressed("foo_uncompressed.str", UNCOMPRESSED, 1)
f2=StringCharFeatures(RAWBYTE);
f2.load_compressed("foo_uncompressed.str", True)
print "uncompressed strings", f2.get_features()
print
# load compressed data and uncompress on load
#lzo
f.save_compressed("foo_lzo.str", LZO, 9)
f2=StringCharFeatures(RAWBYTE);
f2.load_compressed("foo_lzo.str", True)
print "lzo strings", f2.get_features()
print
##gzip
f.save_compressed("foo_gzip.str", GZIP, 9)
f2=StringCharFeatures(RAWBYTE);
f2.load_compressed("foo_gzip.str", True)
print "gzip strings", f2.get_features()
print
#bzip2
f.save_compressed("foo_bzip2.str", BZIP2, 9)
f2=StringCharFeatures(RAWBYTE);
f2.load_compressed("foo_bzip2.str", True)
print "bzip2 strings", f2.get_features()
print
#lzma
f.save_compressed("foo_lzma.str", LZMA, 9)
f2=StringCharFeatures(RAWBYTE);
f2.load_compressed("foo_lzma.str", True)
print "lzma strings", f2.get_features()
print
# load compressed data and uncompress via preprocessor
f2=StringCharFeatures(RAWBYTE);
f2.load_compressed("foo_lzo.str", False)
f2.add_preproc(DecompressCharString(LZO))
f2.apply_preproc()
print "lzo strings", f2.get_features()
print
# load compressed data and uncompress on-the-fly via preprocessor
f2=StringCharFeatures(RAWBYTE);
f2.load_compressed("foo_lzo.str", False)
f2.io.set_loglevel(MSG_DEBUG)
f2.add_preproc(DecompressCharString(LZO))
f2.enable_on_the_fly_preprocessing()
print "lzo strings", f2.get_features()
print
#clean up
import os
for f in ['foo_uncompressed.str', 'foo_lzo.str', 'foo_gzip.str',
'foo_bzip2.str', 'foo_lzma.str', 'foo_lzo.str', 'foo_lzo.str']:
if os.path.exists(f):
os.unlink(f)
##########################################################################################
# some perfectly compressible stuff follows
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
from shogun.Features import StringCharFeatures, RAWBYTE from numpy import array #create string features f=StringCharFeatures(['hey','guys','i','am','a','string'], RAWBYTE) #and output several stats print "max string length", f.get_max_vector_length() print "number of strings", f.get_num_vectors() print "length of first string", f.get_vector_length(0) print "string[5]", ''.join(f.get_feature_vector(5)) print "strings", f.get_features() #replace string 0 f.set_feature_vector(array(['t','e','s','t']), 0) print "strings", f.get_features()
from shogun.Features import StringFileCharFeatures, RAWBYTE
f = StringFileCharFeatures('features_string_file_char_modular.py', RAWBYTE)
print "strings", f.get_features()
from shogun.Features import StringCharFeatures, RAWBYTE
# load features from directory
f=StringCharFeatures(RAWBYTE)
f.load_from_directory(".")
#and output several stats
print "max string length", f.get_max_vector_length()
print "number of strings", f.get_num_vectors()
print "length of first string", f.get_vector_length(0)
print "str[0,0:3]", f.get_feature(0,0), f.get_feature(0,1), f.get_feature(0,2)
print "len(str[0])", f.get_vector_length(0)
print "str[0]", f.get_feature_vector(0)
#or load features from file (one string per line)
f.load('features_string_char_modular.py')
print f.get_features()
#or load fasta file
#f.load_fasta('fasta.fa')
#print f.get_features()
from numpy import * from shogun.Features import * from shogun.Library import MSG_DEBUG order=3 start_order=1 from_order=order hash_bits=2 x=[array([0,1,2,3,0,1,2,3,3,2,2,1,1],dtype=uint8)] print len(x[0]) f=StringByteFeatures(RAWDNA) f.io.set_loglevel(MSG_DEBUG) f.set_features(x) y=HashedWDFeatures(f,start_order,order,from_order,hash_bits) print y.get_dim_feature_space() fm=y.get_feature_matrix() print fm.shape print fm
from shogun.Features import StringCharFeatures, DNA from shogun.Library import DynamicIntArray # create string features with a single string s=10*'A' + 10*'C' + 10*'G' + 10*'T' f=StringCharFeatures([s], DNA) # slide a window of length 5 over features # (memory efficient, does not copy strings) f.obtain_by_sliding_window(5,1) print f.get_num_vectors() print f.get_vector_length(0) print f.get_vector_length(1) print f.get_features() # slide a window of length 4 over features # (memory efficient, does not copy strings) f.obtain_by_sliding_window(4,1) print f.get_num_vectors() print f.get_vector_length(0) print f.get_vector_length(1) print f.get_features() # extract string-windows at position 0,6,16,25 of window size 4 # (memory efficient, does not copy strings) f.set_features([s]) positions=DynamicIntArray() positions.append_element(0) positions.append_element(6) positions.append_element(16) positions.append_element(25) f.obtain_by_position_list(4,positions) print f.get_features() # now extract windows of size 8 from same positon list f.obtain_by_position_list(8,positions) print f.get_features()
from shogun.Features import StringCharFeatures, StringUlongFeatures, RAWBYTE from numpy import array, uint64 #create string features cf=StringCharFeatures(['hey','guys','string'], RAWBYTE) uf=StringUlongFeatures(RAWBYTE) #start=0, order=2, gap=0, rev=False) uf.obtain_from_char(cf, 0, 2, 0, False) #and output several stats print "max string length", uf.get_max_vector_length() print "number of strings", uf.get_num_vectors() print "length of first string", uf.get_vector_length(0) print "string[2]", uf.get_feature_vector(2) print "strings", uf.get_features() #replace string 0 uf.set_feature_vector(array([1,2,3,4,5], dtype=uint64), 0) print "strings", uf.get_features()
from shogun.Features import StringCharFeatures, StringWordFeatures, RAWBYTE from numpy import array, uint16 #create string features cf=StringCharFeatures(['hey','guys','string'], RAWBYTE) wf=StringWordFeatures(RAWBYTE) #start=0, order=2, gap=0, rev=False) wf.obtain_from_char(cf, 0, 2, 0, False) #and output several stats print "max string length", wf.get_max_vector_length() print "number of strings", wf.get_num_vectors() print "length of first string", wf.get_vector_length(0) print "string[2]", wf.get_feature_vector(2) print "strings", wf.get_features() #replace string 0 wf.set_feature_vector(array([1,2,3,4,5], dtype=uint16), 0) print "strings", wf.get_features()
###########################################################################
# kernel can be used to maximize AUC instead of margin in SVMs
###########################################################################
def auc ():
print 'AUC'
from shogun.Kernel import GaussianKernel, AUCKernel
from shogun.Features import RealFeatures, Labels
feats_train=RealFeatures(fm_train_real)
width=1.7
subkernel=GaussianKernel(feats_train, feats_train, width)
kernel=AUCKernel(0, subkernel)
kernel.setup_auc_maximization( Labels(label_train_real) )
km_train=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
from numpy import double
lm=LoadMatrix()
fm_train_real=double(lm.load_numbers('../data/fm_train_real.dat'))
label_train_real=lm.load_labels('../data/label_train_twoclass.dat')
auc()
###########################################################################
# chi2 kernel
###########################################################################
def chi2 ():
print 'Chi2'
from shogun.Kernel import Chi2Kernel
from shogun.Features import RealFeatures
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
width=1.4
size_cache=10
kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
from numpy import double
lm=LoadMatrix()
fm_train_real=double(lm.load_numbers('../data/fm_train_real.dat'))
fm_test_real=double(lm.load_numbers('../data/fm_test_real.dat'))
chi2()
def combined_custom():
    """Train and apply a LibSVM on a combined kernel with a custom sub-kernel.

    The combined kernel consists of a CustomKernel (a precomputed degree-3
    PolyKernel matrix) and a degree-2 PolyKernel.  Reads the module-level
    ``fm_train_real``, ``fm_test_real`` and ``fm_label_twoclass``.  Nothing
    is returned.
    """
    from shogun.Features import CombinedFeatures, RealFeatures, Labels
    from shogun.Kernel import CombinedKernel, PolyKernel, CustomKernel
    from shogun.Classifier import LibSVM

    # ----- training ------------------------------------------------------
    train_kernel = CombinedKernel()
    feats_train = CombinedFeatures()

    # precomputed polynomial kernel matrix, wrapped as a custom kernel
    tfeats = RealFeatures(fm_train_real)
    precomputed = PolyKernel(10, 3)
    precomputed.init(tfeats, tfeats)
    train_kernel.append_kernel(CustomKernel(precomputed.get_kernel_matrix()))

    # second sub-kernel works on its own feature object
    feats_train.append_feature_obj(RealFeatures(fm_train_real))
    train_kernel.append_kernel(PolyKernel(10, 2))
    train_kernel.init(feats_train, feats_train)

    labels = Labels(fm_label_twoclass)
    svm = LibSVM(1.0, train_kernel, labels)
    svm.train()

    # ----- prediction ----------------------------------------------------
    test_kernel = CombinedKernel()
    feats_pred = CombinedFeatures()

    pfeats = RealFeatures(fm_test_real)
    precomputed = PolyKernel(10, 3)
    precomputed.init(tfeats, pfeats)
    test_kernel.append_kernel(CustomKernel(precomputed.get_kernel_matrix()))

    feats_pred.append_feature_obj(RealFeatures(fm_test_real))
    test_kernel.append_kernel(PolyKernel(10, 2))
    test_kernel.init(feats_train, feats_pred)

    svm.set_kernel(test_kernel)
    svm.classify()


if __name__ == '__main__':
    from tools.load import LoadMatrix
    loader = LoadMatrix()
    fm_train_real = loader.load_numbers('../data/fm_train_real.dat')
    fm_test_real = loader.load_numbers('../data/fm_test_real.dat')
    fm_label_twoclass = loader.load_labels('../data/label_train_twoclass.dat')
    combined_custom()
def combined():
print 'Combined'
from shogun.Kernel import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel
from shogun.Features import RealFeatures, StringCharFeatures, CombinedFeatures, DNA
kernel=CombinedKernel()
feats_train=CombinedFeatures()
feats_test=CombinedFeatures()
subkfeats_train=RealFeatures(fm_train_real)
subkfeats_test=RealFeatures(fm_test_real)
subkernel=GaussianKernel(10, 1.1)
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
subkfeats_train=StringCharFeatures(fm_train_dna, DNA)
subkfeats_test=StringCharFeatures(fm_test_dna, DNA)
degree=3
subkernel=FixedDegreeStringKernel(10, degree)
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
subkfeats_train=StringCharFeatures(fm_train_dna, DNA)
subkfeats_test=StringCharFeatures(fm_test_dna, DNA)
subkernel=LocalAlignmentStringKernel(10)
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
from numpy import double
lm=LoadMatrix()
fm_train_real=double(lm.load_numbers('../data/fm_train_real.dat'))
fm_test_real=double(lm.load_numbers('../data/fm_test_real.dat'))
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
combined()
def comm_ulong_string ():
print 'CommUlongString'
from shogun.Kernel import CommUlongStringKernel
from shogun.Features import StringUlongFeatures, StringCharFeatures, DNA
from shogun.PreProc import SortUlongString
order=3
gap=0
reverse=False
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringUlongFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortUlongString()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringUlongFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
use_sign=False
kernel=CommUlongStringKernel(feats_train, feats_train, use_sign)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
comm_ulong_string()
def comm_word_string ():
print 'CommWordString'
from shogun.Kernel import CommWordStringKernel
from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
from shogun.PreProc import SortWordString
order=3
gap=0
reverse=False
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
use_sign=False
kernel=CommWordStringKernel(feats_train, feats_train, use_sign)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
comm_word_string()
def const (): print 'Const' from shogun.Features import DummyFeatures from shogun.Kernel import ConstKernel feats_train=DummyFeatures(10) feats_test=DummyFeatures(17) c=23. kernel=ConstKernel(feats_train, feats_train, c) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() if __name__=='__main__': const()
def custom (): print 'Custom' from numpy.random import rand from numpy import array, float32 from shogun.Features import RealFeatures from shogun.Kernel import CustomKernel dim=7 data=rand(dim, dim) feats=RealFeatures(data) symdata=data+data.T lowertriangle=array([symdata[(x,y)] for x in xrange(symdata.shape[1]) for y in xrange(symdata.shape[0]) if y<=x]) kernel=CustomKernel() # once with float64's kernel.set_triangle_kernel_matrix_from_triangle(lowertriangle) km_triangletriangle=kernel.get_kernel_matrix() kernel.set_triangle_kernel_matrix_from_full(symdata) km_fulltriangle=kernel.get_kernel_matrix() kernel.set_full_kernel_matrix_from_full(data) km_fullfull=kernel.get_kernel_matrix() # now once with float32's data=array(data,dtype=float32) kernel.set_triangle_kernel_matrix_from_triangle(lowertriangle) km_triangletriangle=kernel.get_kernel_matrix() kernel.set_triangle_kernel_matrix_from_full(symdata) km_fulltriangle=kernel.get_kernel_matrix() kernel.set_full_kernel_matrix_from_full(data) km_fullfull=kernel.get_kernel_matrix() if __name__=='__main__': from numpy.random import seed seed(42) custom()
def diag (): print 'Diag' from shogun.Features import DummyFeatures from shogun.Kernel import DiagKernel feats_train=DummyFeatures(10) feats_test=DummyFeatures(17) diag=23. kernel=DiagKernel(feats_train, feats_train, diag) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() if __name__=='__main__': diag()
def distance ():
print 'Distance'
from shogun.Kernel import DistanceKernel
from shogun.Features import RealFeatures
from shogun.Distance import EuclidianDistance
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
width=1.7
distance=EuclidianDistance()
kernel=DistanceKernel(feats_train, feats_test, width, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
from numpy import double
lm=LoadMatrix()
fm_train_real=double(lm.load_numbers('../data/fm_train_real.dat'))
fm_test_real=double(lm.load_numbers('../data/fm_test_real.dat'))
distance()
def fisher ():
print "Fisher Kernel"
from shogun.Features import StringCharFeatures, StringWordFeatures, FKFeatures, DNA
from shogun.Kernel import PolyKernel
from shogun.Distribution import HMM, BW_NORMAL
N=1 # toy HMM with 1 state
M=4 # 4 observations -> DNA
pseudo=1e-1
order=1
gap=0
reverse=False
kargs=[1, False, True]
# train HMM for positive class
charfeat=StringCharFeatures(fm_hmm_pos, DNA)
hmm_pos_train=StringWordFeatures(charfeat.get_alphabet())
hmm_pos_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
pos=HMM(hmm_pos_train, N, M, pseudo)
pos.baum_welch_viterbi_train(BW_NORMAL)
# train HMM for negative class
charfeat=StringCharFeatures(fm_hmm_neg, DNA)
hmm_neg_train=StringWordFeatures(charfeat.get_alphabet())
hmm_neg_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
neg=HMM(hmm_neg_train, N, M, pseudo)
neg.baum_welch_viterbi_train(BW_NORMAL)
# Kernel training data
charfeat=StringCharFeatures(fm_train_dna, DNA)
wordfeats_train=StringWordFeatures(charfeat.get_alphabet())
wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
# Kernel testing data
charfeat=StringCharFeatures(fm_test_dna, DNA)
wordfeats_test=StringWordFeatures(charfeat.get_alphabet())
wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
# get kernel on training data
pos.set_observations(wordfeats_train)
neg.set_observations(wordfeats_train)
feats_train=FKFeatures(10, pos, neg)
feats_train.set_opt_a(-1) #estimate prior
kernel=PolyKernel(feats_train, feats_train, *kargs)
km_train=kernel.get_kernel_matrix()
# get kernel on testing data
pos_clone=HMM(pos)
neg_clone=HMM(neg)
pos_clone.set_observations(wordfeats_test)
neg_clone.set_observations(wordfeats_test)
feats_test=FKFeatures(10, pos_clone, neg_clone)
feats_test.set_a(feats_train.get_a()) #use prior from training data
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
from numpy import where
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
label_train_dna=lm.load_labels('../data/label_train_dna.dat')
fm_hmm_pos=[ fm_train_dna[i] for i in where([label_train_dna==1])[1] ]
fm_hmm_neg=[ fm_train_dna[i] for i in where([label_train_dna==-1])[1] ]
fisher()
def fixed_degree_string ():
print 'FixedDegreeString'
from shogun.Features import StringCharFeatures, DNA
from shogun.Kernel import FixedDegreeStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
degree=3
kernel=FixedDegreeStringKernel(feats_train, feats_train, degree)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
fixed_degree_string()
def gaussian ():
print 'Gaussian'
from shogun.Features import RealFeatures
from shogun.Kernel import GaussianKernel
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
width=1.9
kernel=GaussianKernel(feats_train, feats_train, width)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
gaussian()
def gaussian_shift ():
print 'GaussianShift'
from shogun.Features import RealFeatures
from shogun.Kernel import GaussianShiftKernel
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
width=1.8
max_shift=2
shift_step=1
kernel=GaussianShiftKernel(
feats_train, feats_train, width, max_shift, shift_step)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
gaussian_shift()
def plugin_estimate_histogram ():
print 'PluginEstimate w/ HistogramWord'
from shogun.Features import StringCharFeatures, StringWordFeatures, DNA, Labels
from shogun.Kernel import HistogramWordStringKernel
from shogun.Classifier import PluginEstimate
order=3
gap=0
reverse=False
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
pie=PluginEstimate()
labels=Labels(label_train_dna)
pie.set_labels(labels)
pie.set_features(feats_train)
pie.train()
kernel=HistogramWordStringKernel(feats_train, feats_train, pie)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
pie.set_features(feats_test)
pie.classify().get_labels()
km_test=kernel.get_kernel_matrix()
if __name__=='__main__':
from tools.load import LoadMatrix
lm=LoadMatrix()
fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
label_train_dna=lm.load_labels('../data/label_train_dna.dat')
plugin_estimate_histogram()
def gaussian ():
print 'Gaussian'
from shogun.Features import RealFeatures
from shogun.Kernel import GaussianKernel
from shogun.Library import AsciiFile, BinaryFile
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
width=1.9
kernel=GaussianKernel(feats_train, feats_train, width)
km_train=kernel.get_kernel_matrix()
f=AsciiFile("gaussian_train.ascii","w")
kernel.save(f)
del f
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
f=AsciiFile("gaussian_test.ascii","w")
kernel.save(f)
del f
#clean up
import os
os.unlink("gaussian_test.ascii")
os.unlink("gaussian_train.ascii")
if __name__ == '__main__':
    # Load the real-valued matrices read by gaussian().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')

    gaussian()
###########################################################################
# linear kernel on byte features
###########################################################################
def linear_byte():
    """Evaluate a LinearByteKernel on byte features.

    Reads the module-level globals ``fm_train_byte`` and ``fm_test_byte``.
    The train/train and train/test kernel matrices are computed and
    discarded; nothing is returned.
    """
    print('LinearByte')
    from shogun.Kernel import LinearByteKernel
    from shogun.Features import ByteFeatures

    feats_train = ByteFeatures(fm_train_byte)
    feats_test = ByteFeatures(fm_test_byte)

    byte_kernel = LinearByteKernel(feats_train, feats_train)
    train_matrix = byte_kernel.get_kernel_matrix()

    byte_kernel.init(feats_train, feats_test)
    test_matrix = byte_kernel.get_kernel_matrix()
if __name__ == '__main__':
    # Load the byte matrices (converted with numpy.ubyte) read by linear_byte().
    from tools.load import LoadMatrix
    from numpy import ubyte

    lm = LoadMatrix()
    fm_train_byte = ubyte(lm.load_numbers('../data/fm_train_byte.dat'))
    fm_test_byte = ubyte(lm.load_numbers('../data/fm_test_byte.dat'))

    linear_byte()
def linear ():
    """Evaluate a LinearKernel with an AvgDiagKernelNormalizer on real features.

    Reads the module-level globals ``fm_train_real`` and ``fm_test_real``.
    The train/train and train/test kernel matrices are computed and
    discarded; nothing is returned.
    """
    print('Linear')
    from shogun.Features import RealFeatures
    from shogun.Kernel import LinearKernel, AvgDiagKernelNormalizer

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    scale = 1.2

    # the normalizer is attached before the first init()
    lin_kernel = LinearKernel()
    lin_kernel.set_normalizer(AvgDiagKernelNormalizer(scale))

    lin_kernel.init(feats_train, feats_train)
    train_matrix = lin_kernel.get_kernel_matrix()

    lin_kernel.init(feats_train, feats_test)
    test_matrix = lin_kernel.get_kernel_matrix()
if __name__ == '__main__':
    # Load the real-valued matrices read by linear().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')

    linear()
def linear_string ():
    """Evaluate a LinearStringKernel on DNA string features.

    Reads the module-level globals ``fm_train_dna`` and ``fm_test_dna``.
    The train/train and train/test kernel matrices are computed and
    discarded; nothing is returned.
    """
    print('LinearString')
    from shogun.Features import StringCharFeatures, DNA
    from shogun.Kernel import LinearStringKernel

    feats_train = StringCharFeatures(fm_train_dna, DNA)
    feats_test = StringCharFeatures(fm_test_dna, DNA)

    string_kernel = LinearStringKernel(feats_train, feats_train)
    train_matrix = string_kernel.get_kernel_matrix()

    string_kernel.init(feats_train, feats_test)
    test_matrix = string_kernel.get_kernel_matrix()
if __name__ == '__main__':
    # Load the DNA strings read by linear_string().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_dna = lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna = lm.load_dna('../data/fm_test_dna.dat')

    linear_string()
def linear_word ():
    """Evaluate a LinearWordKernel with an AvgDiagKernelNormalizer on word features.

    Reads the module-level globals ``fm_train_word`` and ``fm_test_word``.
    The train/train and train/test kernel matrices are computed and
    discarded; nothing is returned.
    """
    print('LinearWord')
    from shogun.Kernel import LinearWordKernel, AvgDiagKernelNormalizer
    from shogun.Features import WordFeatures

    feats_train = WordFeatures(fm_train_word)
    feats_test = WordFeatures(fm_test_word)
    scale = 1.4

    # the normalizer is attached before the first init()
    word_kernel = LinearWordKernel()
    word_kernel.set_normalizer(AvgDiagKernelNormalizer(scale))

    word_kernel.init(feats_train, feats_train)
    train_matrix = word_kernel.get_kernel_matrix()

    word_kernel.init(feats_train, feats_test)
    test_matrix = word_kernel.get_kernel_matrix()
if __name__ == '__main__':
    # Load the word matrices (converted with numpy.ushort) read by linear_word().
    from tools.load import LoadMatrix
    from numpy import ushort

    lm = LoadMatrix()
    # Fixed: the training matrix used to be read from fm_test_word.dat, which
    # made the train and test features identical.
    # NOTE(review): assumes ../data/fm_train_word.dat ships with the data set,
    # like the other fm_train_*/fm_test_* pairs in this file - confirm.
    fm_train_word = ushort(lm.load_numbers('../data/fm_train_word.dat'))
    fm_test_word = ushort(lm.load_numbers('../data/fm_test_word.dat'))

    linear_word()
def local_alignment_string():
    """Evaluate a LocalAlignmentStringKernel on DNA string features.

    Reads the module-level globals ``fm_train_dna`` and ``fm_test_dna``.
    The train/train and train/test kernel matrices are computed and
    discarded; nothing is returned.
    """
    print('LocalAlignmentString')
    from shogun.Features import StringCharFeatures, DNA
    from shogun.Kernel import LocalAlignmentStringKernel

    feats_train = StringCharFeatures(fm_train_dna, DNA)
    feats_test = StringCharFeatures(fm_test_dna, DNA)

    align_kernel = LocalAlignmentStringKernel(feats_train, feats_train)
    train_matrix = align_kernel.get_kernel_matrix()

    align_kernel.init(feats_train, feats_test)
    test_matrix = align_kernel.get_kernel_matrix()
if __name__ == '__main__':
    # Load the DNA strings read by local_alignment_string().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_dna = lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna = lm.load_dna('../data/fm_test_dna.dat')

    local_alignment_string()
def locality_improved_string ():
    """Evaluate a LocalityImprovedStringKernel on DNA string features.

    Reads the module-level globals ``fm_train_dna`` and ``fm_test_dna``.
    The train/train and train/test kernel matrices are computed and
    discarded; nothing is returned.
    """
    print('LocalityImprovedString')
    from shogun.Features import StringCharFeatures, DNA
    from shogun.Kernel import LocalityImprovedStringKernel

    feats_train = StringCharFeatures(fm_train_dna, DNA)
    feats_test = StringCharFeatures(fm_test_dna, DNA)

    # kernel parameters, passed positionally in this order
    length, inner_degree, outer_degree = 5, 5, 7

    li_kernel = LocalityImprovedStringKernel(
        feats_train, feats_train, length, inner_degree, outer_degree)
    train_matrix = li_kernel.get_kernel_matrix()

    li_kernel.init(feats_train, feats_test)
    test_matrix = li_kernel.get_kernel_matrix()
if __name__ == '__main__':
    # Load the DNA strings read by locality_improved_string().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_dna = lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna = lm.load_dna('../data/fm_test_dna.dat')

    locality_improved_string()
def match_word_string ():
    """Evaluate a MatchWordStringKernel on word features derived from DNA strings.

    Reads the module-level globals ``fm_train_dna`` and ``fm_test_dna``.
    The train/train and train/test kernel matrices are computed and
    discarded; nothing is returned.
    """
    print('MatchWordString')
    from shogun.Kernel import MatchWordStringKernel, AvgDiagKernelNormalizer
    from shogun.Features import StringWordFeatures, StringCharFeatures, DNA

    # kernel parameters
    degree, scale, size_cache = 3, 1.4, 10
    # char -> word conversion parameters
    order, gap, reverse = 3, 0, False

    chars = StringCharFeatures(fm_train_dna, DNA)
    feats_train = StringWordFeatures(DNA)
    feats_train.obtain_from_char(chars, order - 1, order, gap, reverse)

    chars = StringCharFeatures(fm_test_dna, DNA)
    feats_test = StringWordFeatures(DNA)
    feats_test.obtain_from_char(chars, order - 1, order, gap, reverse)

    # the normalizer is attached before the first init()
    match_kernel = MatchWordStringKernel(size_cache, degree)
    match_kernel.set_normalizer(AvgDiagKernelNormalizer(scale))

    match_kernel.init(feats_train, feats_train)
    train_matrix = match_kernel.get_kernel_matrix()

    match_kernel.init(feats_train, feats_test)
    test_matrix = match_kernel.get_kernel_matrix()
if __name__ == '__main__':
    # Load the DNA strings read by match_word_string().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_dna = lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna = lm.load_dna('../data/fm_test_dna.dat')

    match_word_string()
def oligo_string ():
    """Evaluate an OligoStringKernel on DNA string features.

    Reads the module-level globals ``fm_train_dna`` and ``fm_test_dna``.
    The train/train and train/test kernel matrices are computed and
    discarded; nothing is returned.
    """
    print('OligoString')
    from shogun.Features import StringCharFeatures, DNA
    from shogun.Kernel import OligoStringKernel

    feats_train = StringCharFeatures(fm_train_dna, DNA)
    feats_test = StringCharFeatures(fm_test_dna, DNA)

    # kernel parameters, passed positionally as (size_cache, k, width)
    k, width, size_cache = 3, 1.2, 10

    oligo_kernel = OligoStringKernel(size_cache, k, width)

    oligo_kernel.init(feats_train, feats_train)
    train_matrix = oligo_kernel.get_kernel_matrix()

    oligo_kernel.init(feats_train, feats_test)
    test_matrix = oligo_kernel.get_kernel_matrix()
if __name__ == '__main__':
    # Load the DNA strings read by oligo_string().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_dna = lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna = lm.load_dna('../data/fm_test_dna.dat')

    oligo_string()
def poly_match_string ():
    """Evaluate a PolyMatchStringKernel on DNA string features.

    Reads the module-level globals ``fm_train_dna`` and ``fm_test_dna``.
    The train/train and train/test kernel matrices are computed and
    discarded; nothing is returned.
    """
    print('PolyMatchString')
    from shogun.Kernel import PolyMatchStringKernel
    from shogun.Features import StringCharFeatures, DNA

    feats_train = StringCharFeatures(fm_train_dna, DNA)
    # Fixed: the test features used to be built from fm_train_dna, so the
    # "test" kernel matrix was just a second train/train matrix.
    feats_test = StringCharFeatures(fm_test_dna, DNA)

    degree = 3
    inhomogene = False

    kernel = PolyMatchStringKernel(feats_train, feats_train, degree, inhomogene)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
if __name__ == '__main__':
    # Load the DNA strings for poly_match_string().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_dna = lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna = lm.load_dna('../data/fm_test_dna.dat')

    poly_match_string()
def poly_match_word_string ():
    """Evaluate a PolyMatchWordStringKernel on word features derived from DNA strings.

    Reads the module-level globals ``fm_train_dna`` and ``fm_test_dna``.
    The train/train and train/test kernel matrices are computed and
    discarded; nothing is returned.
    """
    print('PolyMatchWordString')
    from shogun.Kernel import PolyMatchWordStringKernel
    from shogun.Features import StringWordFeatures, StringCharFeatures, DNA

    # kernel parameters
    degree, inhomogene = 2, True
    # char -> word conversion parameters
    order, gap, reverse = 3, 0, False

    chars = StringCharFeatures(fm_train_dna, DNA)
    feats_train = StringWordFeatures(DNA)
    feats_train.obtain_from_char(chars, order - 1, order, gap, reverse)

    chars = StringCharFeatures(fm_test_dna, DNA)
    feats_test = StringWordFeatures(DNA)
    feats_test.obtain_from_char(chars, order - 1, order, gap, reverse)

    pm_kernel = PolyMatchWordStringKernel(feats_train, feats_train, degree, inhomogene)
    train_matrix = pm_kernel.get_kernel_matrix()

    pm_kernel.init(feats_train, feats_test)
    test_matrix = pm_kernel.get_kernel_matrix()
if __name__ == '__main__':
    # Load the DNA strings read by poly_match_word_string().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_dna = lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna = lm.load_dna('../data/fm_test_dna.dat')

    poly_match_word_string()
def poly ():
    """Evaluate a PolyKernel on real features.

    Reads the module-level globals ``fm_train_real`` and ``fm_test_real``.
    The train/train and train/test kernel matrices are computed and
    discarded; nothing is returned.
    """
    print('Poly')
    from shogun.Features import RealFeatures
    from shogun.Kernel import PolyKernel

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    # kernel parameters, passed positionally in this order
    degree, inhomogene, use_normalization = 4, False, True

    poly_kernel = PolyKernel(
        feats_train, feats_train, degree, inhomogene, use_normalization)
    train_matrix = poly_kernel.get_kernel_matrix()

    poly_kernel.init(feats_train, feats_test)
    test_matrix = poly_kernel.get_kernel_matrix()
if __name__ == '__main__':
    # Load the real-valued matrices read by poly().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')

    poly()
def plugin_estimate_salzberg ():
    """Train a PluginEstimate on DNA word features and evaluate a SalzbergWord kernel.

    Reads the module-level globals ``fm_train_dna``, ``fm_test_dna`` and
    ``label_train_dna``.  Every result is computed and discarded; nothing
    is returned.
    """
    print('PluginEstimate w/ SalzbergWord')
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA, Labels
    from shogun.Kernel import SalzbergWordStringKernel
    from shogun.Classifier import PluginEstimate

    order = 3
    gap = 0
    reverse = False

    # training strings -> word features
    charfeat = StringCharFeatures(fm_train_dna, DNA)
    feats_train = StringWordFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    # test strings -> word features
    charfeat = StringCharFeatures(fm_test_dna, DNA)
    feats_test = StringWordFeatures(charfeat.get_alphabet())
    feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    # fit the plugin estimate on the training words
    pie = PluginEstimate()
    labels = Labels(label_train_dna)
    pie.set_labels(labels)
    pie.set_features(feats_train)
    pie.train()

    # Fixed: the kernel used to be constructed on (feats_train, feats_test),
    # so km_train was not a train/train matrix.  The train/test pairing is
    # set up by the init() call below.
    kernel = SalzbergWordStringKernel(feats_train, feats_train, pie, labels)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    pie.set_features(feats_test)
    pie.classify().get_labels()
    km_test = kernel.get_kernel_matrix()
if __name__ == '__main__':
    # Load the DNA strings and training labels read by plugin_estimate_salzberg().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_dna = lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna = lm.load_dna('../data/fm_test_dna.dat')
    label_train_dna = lm.load_labels('../data/label_train_dna.dat')

    plugin_estimate_salzberg()
def sigmoid ():
    """Evaluate a SigmoidKernel on real features.

    Reads the module-level globals ``fm_train_real`` and ``fm_test_real``.
    The train/train and train/test kernel matrices are computed and
    discarded; nothing is returned.
    """
    print('Sigmoid')
    from shogun.Features import RealFeatures
    from shogun.Kernel import SigmoidKernel

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    # kernel parameters, passed positionally in this order
    size_cache, gamma, coef0 = 10, 1.2, 1.3

    sig_kernel = SigmoidKernel(feats_train, feats_train, size_cache, gamma, coef0)
    train_matrix = sig_kernel.get_kernel_matrix()

    sig_kernel.init(feats_train, feats_test)
    test_matrix = sig_kernel.get_kernel_matrix()
if __name__ == '__main__':
    # Load the real-valued matrices read by sigmoid().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')

    sigmoid()
def simple_locality_improved_string ():
    """Evaluate a SimpleLocalityImprovedStringKernel on DNA string features.

    Reads the module-level globals ``fm_train_dna`` and ``fm_test_dna``.
    The train/train and train/test kernel matrices are computed and
    discarded; nothing is returned.
    """
    print('SimpleLocalityImprovedString')
    from shogun.Features import StringCharFeatures, DNA
    from shogun.Kernel import SimpleLocalityImprovedStringKernel

    feats_train = StringCharFeatures(fm_train_dna, DNA)
    feats_test = StringCharFeatures(fm_test_dna, DNA)

    # kernel parameters, passed positionally in this order
    length, inner_degree, outer_degree = 5, 5, 7

    sli_kernel = SimpleLocalityImprovedStringKernel(
        feats_train, feats_train, length, inner_degree, outer_degree)
    train_matrix = sli_kernel.get_kernel_matrix()

    sli_kernel.init(feats_train, feats_test)
    test_matrix = sli_kernel.get_kernel_matrix()
if __name__ == '__main__':
    # Load the DNA strings read by simple_locality_improved_string().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_dna = lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna = lm.load_dna('../data/fm_test_dna.dat')

    simple_locality_improved_string()
def sparse_gaussian ():
    """Evaluate a SparseGaussianKernel on sparse real features.

    Reads the module-level globals ``fm_train_real`` and ``fm_test_real``.
    The train/train and train/test kernel matrices are computed and
    discarded; nothing is returned.
    """
    print('SparseGaussian')
    from shogun.Features import SparseRealFeatures
    from shogun.Kernel import SparseGaussianKernel

    feats_train = SparseRealFeatures(fm_train_real)
    feats_test = SparseRealFeatures(fm_test_real)
    width = 1.1

    sg_kernel = SparseGaussianKernel(feats_train, feats_train, width)
    train_matrix = sg_kernel.get_kernel_matrix()

    sg_kernel.init(feats_train, feats_test)
    test_matrix = sg_kernel.get_kernel_matrix()
if __name__ == '__main__':
    # Load the real-valued matrices read by sparse_gaussian().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')

    sparse_gaussian()
def sparse_linear ():
    """Evaluate a SparseLinearKernel with an AvgDiagKernelNormalizer on sparse real features.

    Reads the module-level globals ``fm_train_real`` and ``fm_test_real``.
    The train/train and train/test kernel matrices are computed and
    discarded; nothing is returned.
    """
    print('SparseLinear')
    from shogun.Features import SparseRealFeatures
    from shogun.Kernel import SparseLinearKernel, AvgDiagKernelNormalizer

    feats_train = SparseRealFeatures(fm_train_real)
    feats_test = SparseRealFeatures(fm_test_real)
    scale = 1.1

    # the normalizer is attached before the first init()
    sl_kernel = SparseLinearKernel()
    sl_kernel.set_normalizer(AvgDiagKernelNormalizer(scale))

    sl_kernel.init(feats_train, feats_train)
    train_matrix = sl_kernel.get_kernel_matrix()

    sl_kernel.init(feats_train, feats_test)
    test_matrix = sl_kernel.get_kernel_matrix()
if __name__ == '__main__':
    # Load the real-valued matrices read by sparse_linear().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')

    sparse_linear()
def sparse_poly ():
    """Evaluate a SparsePolyKernel on sparse real features.

    Reads the module-level globals ``fm_train_real`` and ``fm_test_real``.
    The train/train and train/test kernel matrices are computed and
    discarded; nothing is returned.
    """
    print('SparsePoly')
    from shogun.Features import SparseRealFeatures
    from shogun.Kernel import SparsePolyKernel

    feats_train = SparseRealFeatures(fm_train_real)
    feats_test = SparseRealFeatures(fm_test_real)

    # kernel parameters, passed positionally in this order
    size_cache, degree, inhomogene = 10, 3, True

    sp_kernel = SparsePolyKernel(
        feats_train, feats_train, size_cache, degree, inhomogene)
    train_matrix = sp_kernel.get_kernel_matrix()

    sp_kernel.init(feats_train, feats_test)
    test_matrix = sp_kernel.get_kernel_matrix()
if __name__ == '__main__':
    # Load the real-valued matrices read by sparse_poly().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')

    sparse_poly()
def top():
    """Evaluate a PolyKernel on TOPFeatures built from two single-state HMMs.

    Reads the module-level globals ``fm_hmm_pos``, ``fm_hmm_neg``,
    ``fm_train_dna`` and ``fm_test_dna``.  One HMM is trained per class,
    then both are used to build TOP features for the train and test
    strings.  The kernel matrices are computed and discarded; nothing is
    returned.
    """
    print("TOP Kernel")
    from shogun.Features import StringCharFeatures, StringWordFeatures, TOPFeatures, DNA
    from shogun.Kernel import PolyKernel
    from shogun.Distribution import HMM, BW_NORMAL

    N = 1  # toy HMM with 1 state
    M = 4  # 4 observations -> DNA
    pseudo = 1e-1
    order, gap, reverse = 1, 0, False
    kargs = [1, False, True]  # trailing positional arguments for PolyKernel

    # train HMM for positive class
    chars = StringCharFeatures(fm_hmm_pos, DNA)
    hmm_pos_train = StringWordFeatures(chars.get_alphabet())
    hmm_pos_train.obtain_from_char(chars, order - 1, order, gap, reverse)
    pos = HMM(hmm_pos_train, N, M, pseudo)
    pos.baum_welch_viterbi_train(BW_NORMAL)

    # train HMM for negative class
    chars = StringCharFeatures(fm_hmm_neg, DNA)
    hmm_neg_train = StringWordFeatures(chars.get_alphabet())
    hmm_neg_train.obtain_from_char(chars, order - 1, order, gap, reverse)
    neg = HMM(hmm_neg_train, N, M, pseudo)
    neg.baum_welch_viterbi_train(BW_NORMAL)

    # Kernel training data
    chars = StringCharFeatures(fm_train_dna, DNA)
    wordfeats_train = StringWordFeatures(chars.get_alphabet())
    wordfeats_train.obtain_from_char(chars, order - 1, order, gap, reverse)

    # Kernel testing data
    chars = StringCharFeatures(fm_test_dna, DNA)
    wordfeats_test = StringWordFeatures(chars.get_alphabet())
    wordfeats_test.obtain_from_char(chars, order - 1, order, gap, reverse)

    # get kernel on training data
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    feats_train = TOPFeatures(10, pos, neg, False, False)
    top_kernel = PolyKernel(feats_train, feats_train, *kargs)
    train_matrix = top_kernel.get_kernel_matrix()

    # get kernel on testing data; copies of the HMMs receive the test
    # observations, the trained originals keep the training ones
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test = TOPFeatures(10, pos_clone, neg_clone, False, False)
    top_kernel.init(feats_train, feats_test)
    test_matrix = top_kernel.get_kernel_matrix()
if __name__ == '__main__':
    # Load the DNA data and split the training strings by label for top().
    from tools.load import LoadMatrix
    from numpy import where

    lm = LoadMatrix()
    fm_train_dna = lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna = lm.load_dna('../data/fm_test_dna.dat')
    label_train_dna = lm.load_labels('../data/label_train_dna.dat')

    # The mask is wrapped in a list before where(), and the second index
    # array of the result is used to pick the training strings.
    fm_hmm_pos = [ fm_train_dna[i] for i in where([label_train_dna==1])[1] ]
    fm_hmm_neg = [ fm_train_dna[i] for i in where([label_train_dna==-1])[1] ]

    top()
def weighted_comm_word_string ():
    """Evaluate a WeightedCommWordStringKernel on sorted DNA word features.

    Reads the module-level globals ``fm_train_dna`` and ``fm_test_dna``.
    A single SortWordString preprocessor is initialised on the training
    features and applied to both feature sets.  The kernel matrices are
    computed and discarded; nothing is returned.
    """
    print('WeightedCommWordString')
    from shogun.Kernel import WeightedCommWordStringKernel
    from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
    from shogun.PreProc import SortWordString

    order, gap, reverse = 3, 0, True

    # training strings -> sorted word features
    chars = StringCharFeatures(fm_train_dna, DNA)
    feats_train = StringWordFeatures(chars.get_alphabet())
    feats_train.obtain_from_char(chars, order - 1, order, gap, reverse)
    sorter = SortWordString()
    sorter.init(feats_train)
    feats_train.add_preproc(sorter)
    feats_train.apply_preproc()

    # test strings -> sorted word features (same preprocessor object)
    chars = StringCharFeatures(fm_test_dna, DNA)
    feats_test = StringWordFeatures(chars.get_alphabet())
    feats_test.obtain_from_char(chars, order - 1, order, gap, reverse)
    feats_test.add_preproc(sorter)
    feats_test.apply_preproc()

    use_sign = False
    wcw_kernel = WeightedCommWordStringKernel(feats_train, feats_train, use_sign)
    train_matrix = wcw_kernel.get_kernel_matrix()

    wcw_kernel.init(feats_train, feats_test)
    test_matrix = wcw_kernel.get_kernel_matrix()
if __name__ == '__main__':
    # Load the DNA strings read by weighted_comm_word_string().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_dna = lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna = lm.load_dna('../data/fm_test_dna.dat')

    weighted_comm_word_string()
def weighted_degree_position_string ():
    """Evaluate a WeightedDegreePositionStringKernel on DNA string features.

    Reads the module-level globals ``fm_train_dna`` and ``fm_test_dna``.
    The train/train and train/test kernel matrices are computed and
    discarded; nothing is returned.
    """
    print('WeightedDegreePositionString')
    from shogun.Features import StringCharFeatures, DNA
    from shogun.Kernel import WeightedDegreePositionStringKernel

    feats_train = StringCharFeatures(fm_train_dna, DNA)
    feats_test = StringCharFeatures(fm_test_dna, DNA)
    degree = 20

    wdp_kernel = WeightedDegreePositionStringKernel(feats_train, feats_train, degree)
    # disabled in the original example:
    #kernel.set_shifts(zeros(len(data['train'][0]), dtype=int))
    train_matrix = wdp_kernel.get_kernel_matrix()

    wdp_kernel.init(feats_train, feats_test)
    test_matrix = wdp_kernel.get_kernel_matrix()
if __name__ == '__main__':
    # Load the DNA strings read by weighted_degree_position_string().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_dna = lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna = lm.load_dna('../data/fm_test_dna.dat')

    weighted_degree_position_string()
def weighted_degree_string ():
    """Evaluate a WeightedDegreeStringKernel on DNA string features.

    Reads the module-level globals ``fm_train_dna`` and ``fm_test_dna``.
    The train/train and train/test kernel matrices are computed and
    discarded; nothing is returned.
    """
    print('WeightedDegreeString')
    from shogun.Features import StringCharFeatures, DNA
    from shogun.Kernel import WeightedDegreeStringKernel

    feats_train = StringCharFeatures(fm_train_dna, DNA)
    feats_test = StringCharFeatures(fm_test_dna, DNA)
    degree = 20

    wd_kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)
    # disabled in the original example (explicit weights):
    #weights=arange(1,degree+1,dtype=double)[::-1]/ \
    # sum(arange(1,degree+1,dtype=double))
    #kernel.set_wd_weights(weights)
    train_matrix = wd_kernel.get_kernel_matrix()

    wd_kernel.init(feats_train, feats_test)
    test_matrix = wd_kernel.get_kernel_matrix()
if __name__ == '__main__':
    # Load the DNA strings read by weighted_degree_string().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_dna = lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna = lm.load_dna('../data/fm_test_dna.dat')

    weighted_degree_string()
from shogun.Features import CombinedFeatures, RealFeatures, Labels
from shogun.Kernel import CombinedKernel, PolyKernel, CustomKernel
from shogun.Classifier import MKLClassification
def combined_custom():
    """Train and apply MKLClassification on a CustomKernel + PolyKernel combination.

    Reads the module-level globals ``fm_train_real``, ``fm_test_real`` and
    ``fm_label_twoclass``.  Precomputed polynomial kernel matrices are
    wrapped in CustomKernel objects and combined with a second PolyKernel.
    The classification result is discarded; nothing is returned.
    """
    ##################################
    # set up and train

    # precompute degree-3 poly kernel matrices (train/train and train/test)
    train_feats = RealFeatures(fm_train_real)
    precomp_kernel = PolyKernel(10, 3)
    precomp_kernel.init(train_feats, train_feats)
    K_train = precomp_kernel.get_kernel_matrix()

    test_feats = RealFeatures(fm_test_real)
    precomp_kernel.init(train_feats, test_feats)
    K_test = precomp_kernel.get_kernel_matrix()

    # create combined train features
    feats_train = CombinedFeatures()
    feats_train.append_feature_obj(RealFeatures(fm_train_real))

    # and corresponding combined kernel
    train_kernel = CombinedKernel()
    train_kernel.append_kernel(CustomKernel(K_train))
    train_kernel.append_kernel(PolyKernel(10, 2))
    train_kernel.init(feats_train, feats_train)

    # train mkl
    labels = Labels(fm_label_twoclass)
    mkl = MKLClassification()
    mkl.set_mkl_norm(1)  # which norm to use for MKL; alternatives: 2, 3
    mkl.set_C(1, 1)      # set cost (neg, pos)
    mkl.set_kernel(train_kernel)
    mkl.set_labels(labels)
    mkl.train()

    # disabled in the original example:
    #w=kernel.get_subkernel_weights()
    #kernel.set_subkernel_weights(w)

    ##################################
    # test

    # create combined test features
    feats_pred = CombinedFeatures()
    feats_pred.append_feature_obj(RealFeatures(fm_test_real))

    # and corresponding combined kernel
    test_kernel = CombinedKernel()
    test_kernel.append_kernel(CustomKernel(K_test))
    test_kernel.append_kernel(PolyKernel(10, 2))
    test_kernel.init(feats_train, feats_pred)

    # and classify
    mkl.set_kernel(test_kernel)
    mkl.classify()
if __name__ == '__main__':
    # Load the real-valued matrices and two-class labels read by combined_custom().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    fm_label_twoclass = lm.load_labels('../data/label_train_twoclass.dat')

    # bare attribute lookups kept from the original; their values are unused
    fm_train_real.shape
    fm_test_real.shape

    combined_custom()
from shogun.Features import CombinedFeatures, RealFeatures, Labels
from shogun.Kernel import CombinedKernel, GaussianKernel, LinearKernel,PolyKernel
from shogun.Classifier import MKLMultiClass
def mkl_multiclass ():
    """Train MKLMultiClass on a Gaussian + Linear + Poly kernel combination and print its output.

    Reads the module-level globals ``fm_train_real``, ``fm_test_real`` and
    ``label_train_multiclass``.  Each sub-kernel gets its own RealFeatures
    built from the same train/test matrices.  The labels obtained on the
    test features are printed; nothing is returned.
    """
    print('mkl_multiclass')

    width = 1.2
    C = 1.2
    epsilon = 1e-5
    num_threads = 1

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    # One (train features, test features, sub-kernel) triple per entry.
    # Factories keep the construction order features-then-kernel per entry.
    subkernel_factories = (
        lambda: GaussianKernel(10, width),
        lambda: LinearKernel(),
        lambda: PolyKernel(10,2),
    )
    for make_subkernel in subkernel_factories:
        subkfeats_train = RealFeatures(fm_train_real)
        subkfeats_test = RealFeatures(fm_test_real)
        subkernel = make_subkernel()
        feats_train.append_feature_obj(subkfeats_train)
        feats_test.append_feature_obj(subkfeats_test)
        kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)

    labels = Labels(label_train_multiclass)
    mkl = MKLMultiClass(C, kernel, labels)
    mkl.set_epsilon(epsilon)
    mkl.parallel.set_num_threads(num_threads)
    mkl.set_mkl_epsilon(0.001)
    mkl.set_mkl_norm(1.5)
    mkl.train()

    # switch the combined kernel to train/test before classifying
    kernel.init(feats_train, feats_test)
    out = mkl.classify().get_labels()
    print(out)
if __name__ == '__main__':
    # Load the real-valued matrices and multiclass labels read by mkl_multiclass().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    label_train_multiclass = lm.load_labels('../data/label_train_multiclass.dat')

    mkl_multiclass()
def log_plus_one ():
    """Apply the LogPlusOne preprocessor to real features and evaluate a Chi2Kernel.

    Reads the module-level globals ``fm_train_real`` and ``fm_test_real``.
    The preprocessor is initialised on the training features and applied
    to both feature sets.  The kernel matrices are computed and discarded;
    nothing is returned.
    """
    print('LogPlusOne')
    from shogun.Kernel import Chi2Kernel
    from shogun.Features import RealFeatures
    from shogun.PreProc import LogPlusOne

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    transform = LogPlusOne()
    transform.init(feats_train)
    feats_train.add_preproc(transform)
    feats_train.apply_preproc()
    feats_test.add_preproc(transform)
    feats_test.apply_preproc()

    width, size_cache = 1.4, 10

    chi2 = Chi2Kernel(feats_train, feats_train, width, size_cache)
    train_matrix = chi2.get_kernel_matrix()

    chi2.init(feats_train, feats_test)
    test_matrix = chi2.get_kernel_matrix()
if __name__ == '__main__':
    # Load the real-valued matrices read by log_plus_one().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')

    log_plus_one()
def norm_one ():
    """Apply the NormOne preprocessor to real features and evaluate a Chi2Kernel.

    Reads the module-level globals ``fm_train_real`` and ``fm_test_real``.
    The preprocessor is initialised on the training features and applied
    to both feature sets.  The kernel matrices are computed and discarded;
    nothing is returned.
    """
    print('NormOne')
    from shogun.Kernel import Chi2Kernel
    from shogun.Features import RealFeatures
    from shogun.PreProc import NormOne

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    transform = NormOne()
    transform.init(feats_train)
    feats_train.add_preproc(transform)
    feats_train.apply_preproc()
    feats_test.add_preproc(transform)
    feats_test.apply_preproc()

    width, size_cache = 1.4, 10

    chi2 = Chi2Kernel(feats_train, feats_train, width, size_cache)
    train_matrix = chi2.get_kernel_matrix()

    chi2.init(feats_train, feats_test)
    test_matrix = chi2.get_kernel_matrix()
if __name__ == '__main__':
    # Load the real-valued matrices read by norm_one().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')

    norm_one()
def prune_var_sub_mean ():
    """Apply the PruneVarSubMean preprocessor to real features and evaluate a Chi2Kernel.

    Reads the module-level globals ``fm_train_real`` and ``fm_test_real``.
    The preprocessor is initialised on the training features and applied
    to both feature sets.  The kernel matrices are computed and discarded;
    nothing is returned.
    """
    print('PruneVarSubMean')
    from shogun.Kernel import Chi2Kernel
    from shogun.Features import RealFeatures
    from shogun.PreProc import PruneVarSubMean

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    transform = PruneVarSubMean()
    transform.init(feats_train)
    feats_train.add_preproc(transform)
    feats_train.apply_preproc()
    feats_test.add_preproc(transform)
    feats_test.apply_preproc()

    width, size_cache = 1.4, 10

    chi2 = Chi2Kernel(feats_train, feats_train, width, size_cache)
    train_matrix = chi2.get_kernel_matrix()

    chi2.init(feats_train, feats_test)
    test_matrix = chi2.get_kernel_matrix()
if __name__ == '__main__':
    # Load the real-valued matrices read by prune_var_sub_mean().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')

    prune_var_sub_mean()
def sort_ulong_string ():
    """Sort ulong string features with SortUlongString and evaluate a CommUlongStringKernel.

    Reads the module-level globals ``fm_train_dna`` and ``fm_test_dna``.
    The preprocessor is initialised on the training features and applied
    to both feature sets.  The kernel matrices are computed and discarded;
    nothing is returned.
    """
    print('CommUlongString')
    from shogun.Kernel import CommUlongStringKernel
    from shogun.Features import StringCharFeatures, StringUlongFeatures, DNA
    from shogun.PreProc import SortUlongString

    order, gap, reverse = 3, 0, False

    # training strings -> ulong features
    chars = StringCharFeatures(DNA)
    chars.set_features(fm_train_dna)
    feats_train = StringUlongFeatures(chars.get_alphabet())
    feats_train.obtain_from_char(chars, order - 1, order, gap, reverse)

    # test strings -> ulong features
    chars = StringCharFeatures(DNA)
    chars.set_features(fm_test_dna)
    feats_test = StringUlongFeatures(chars.get_alphabet())
    feats_test.obtain_from_char(chars, order - 1, order, gap, reverse)

    sorter = SortUlongString()
    sorter.init(feats_train)
    feats_train.add_preproc(sorter)
    feats_train.apply_preproc()
    feats_test.add_preproc(sorter)
    feats_test.apply_preproc()

    use_sign = False
    comm_kernel = CommUlongStringKernel(feats_train, feats_train, use_sign)
    train_matrix = comm_kernel.get_kernel_matrix()

    comm_kernel.init(feats_train, feats_test)
    test_matrix = comm_kernel.get_kernel_matrix()
if __name__ == '__main__':
    # Load the DNA strings read by sort_ulong_string().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_dna = lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna = lm.load_dna('../data/fm_test_dna.dat')

    sort_ulong_string()
def sort_word_string ():
    """Sort word string features with SortWordString and evaluate a CommWordStringKernel.

    Reads the module-level globals ``fm_train_dna`` and ``fm_test_dna``.
    The preprocessor is initialised on the training features and applied
    to both feature sets.  The kernel matrices are computed and discarded;
    nothing is returned.
    """
    print('CommWordString')
    from shogun.Kernel import CommWordStringKernel
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.PreProc import SortWordString

    order, gap, reverse = 3, 0, False

    # training strings -> sorted word features
    chars = StringCharFeatures(fm_train_dna, DNA)
    feats_train = StringWordFeatures(chars.get_alphabet())
    feats_train.obtain_from_char(chars, order - 1, order, gap, reverse)
    sorter = SortWordString()
    sorter.init(feats_train)
    feats_train.add_preproc(sorter)
    feats_train.apply_preproc()

    # test strings -> sorted word features (same preprocessor object)
    chars = StringCharFeatures(fm_test_dna, DNA)
    feats_test = StringWordFeatures(chars.get_alphabet())
    feats_test.obtain_from_char(chars, order - 1, order, gap, reverse)
    feats_test.add_preproc(sorter)
    feats_test.apply_preproc()

    use_sign = False
    comm_kernel = CommWordStringKernel(feats_train, feats_train, use_sign)
    train_matrix = comm_kernel.get_kernel_matrix()

    comm_kernel.init(feats_train, feats_test)
    test_matrix = comm_kernel.get_kernel_matrix()
if __name__ == '__main__':
    # Load the DNA strings read by sort_word_string().
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_dna = lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna = lm.load_dna('../data/fm_test_dna.dat')

    sort_word_string()
###########################################################################
# kernel ridge regression
###########################################################################
def krr ():
    """Train kernel ridge regression with a Gaussian kernel and return its test output.

    Reads the module-level globals ``fm_train``, ``fm_test`` and
    ``label_train``.

    Returns the result of ``classify().get_labels()`` after the kernel has
    been re-initialised on the train/test features.
    """
    print('KRR')
    from shogun.Features import Labels, RealFeatures
    from shogun.Kernel import GaussianKernel
    from shogun.Regression import KRR

    feats_train = RealFeatures(fm_train)
    feats_test = RealFeatures(fm_test)

    width = 0.8
    gauss = GaussianKernel(feats_train, feats_train, width)

    tau = 1e-6
    train_labels = Labels(label_train)

    regressor = KRR(tau, gauss, train_labels)
    regressor.train(feats_train)

    # switch the kernel to train/test before predicting
    gauss.init(feats_train, feats_test)
    return regressor.classify().get_labels()
# equivalent shorter version
def krr_short ():
    """Compact variant of the KRR example that passes features to train()/classify().

    Reads the module-level globals ``fm_train``, ``fm_test`` and
    ``label_train``.

    Returns the result of ``classify(RealFeatures(fm_test)).get_labels()``.
    """
    print('KRR_short')
    from shogun.Features import Labels, RealFeatures
    from shogun.Kernel import GaussianKernel
    from shogun.Regression import KRR

    width = 0.8
    tau = 1e-6

    # the kernel is created without features; they are supplied to
    # train() and classify() below
    regressor = KRR(tau, GaussianKernel(0, width), Labels(label_train))
    regressor.train(RealFeatures(fm_train))

    predictions = regressor.classify(RealFeatures(fm_test)).get_labels()
    return predictions
if __name__ == '__main__':
    # Load the data read by krr() and krr_short(), then run both variants.
    from numpy import array
    from numpy.random import seed, rand
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train = lm.load_numbers('../data/fm_train_real.dat')
    fm_test = lm.load_numbers('../data/fm_test_real.dat')
    label_train = lm.load_labels('../data/label_train_twoclass.dat')

    out1 = krr()
    out2 = krr_short()
def libsvr ():
    """Train a LibSVR regressor with a Gaussian kernel and predict on the test features.

    Reads the module-level globals ``fm_train``, ``fm_test`` and
    ``label_train``.  Predictions are obtained twice: once after
    re-initialising the kernel on train/test, once by passing the test
    features to ``classify``.  Both are discarded; nothing is returned.
    """
    print('LibSVR')
    from shogun.Features import Labels, RealFeatures
    from shogun.Kernel import GaussianKernel
    from shogun.Regression import LibSVR

    feats_train = RealFeatures(fm_train)
    feats_test = RealFeatures(fm_test)

    width = 2.1
    gauss = GaussianKernel(feats_train, feats_train, width)

    C = 1
    epsilon = 1e-5
    tube_epsilon = 1e-2
    train_labels = Labels(label_train)

    regressor = LibSVR(C, epsilon, gauss, train_labels)
    regressor.set_tube_epsilon(tube_epsilon)
    regressor.train()

    gauss.init(feats_train, feats_test)
    via_kernel = regressor.classify().get_labels()
    via_features = regressor.classify(feats_test).get_labels()
if __name__ == '__main__':
    # Load the data read by libsvr().
    from numpy import array
    from numpy.random import seed, rand
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train = lm.load_numbers('../data/fm_train_real.dat')
    fm_test = lm.load_numbers('../data/fm_test_real.dat')
    label_train = lm.load_labels('../data/label_train_twoclass.dat')

    libsvr()
###########################################################################
# svm light based support vector regression
###########################################################################
def svr_light ():
    """Train an SVRLight regressor with a Gaussian kernel and predict on the test features.

    Reads the module-level globals ``fm_train``, ``fm_test`` and
    ``label_train``.  If ``SVRLight`` cannot be imported a message is
    printed and the function returns early.  The prediction is discarded;
    nothing is returned.
    """
    print('SVRLight')
    from shogun.Features import Labels, RealFeatures
    from shogun.Kernel import GaussianKernel
    try:
        from shogun.Regression import SVRLight
    except ImportError:
        print('No support for SVRLight available.')
        return

    feats_train = RealFeatures(fm_train)
    feats_test = RealFeatures(fm_test)

    width = 2.1
    gauss = GaussianKernel(feats_train, feats_train, width)

    C = 1
    epsilon = 1e-5
    tube_epsilon = 1e-2
    num_threads = 3
    train_labels = Labels(label_train)

    regressor = SVRLight(C, epsilon, gauss, train_labels)
    regressor.set_tube_epsilon(tube_epsilon)
    regressor.parallel.set_num_threads(num_threads)
    regressor.train()

    # switch the kernel to train/test before predicting
    gauss.init(feats_train, feats_test)
    regressor.classify().get_labels()
if __name__ == '__main__':
    # Load the data read by svr_light().
    from numpy import array
    from numpy.random import seed, rand
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train = lm.load_numbers('../data/fm_train_real.dat')
    fm_test = lm.load_numbers('../data/fm_test_real.dat')
    label_train = lm.load_labels('../data/label_train_twoclass.dat')

    svr_light()
from shogun.Features import *
from shogun.Library import MSG_DEBUG
from shogun.Features import StringCharFeatures, Labels, DNA, Alphabet
from shogun.Kernel import WeightedDegreeStringKernel, GaussianKernel
from shogun.Classifier import SVMLight
from numpy import *
from numpy.random import randn
import sys
import types
import random
import bz2
import cPickle
import inspect
def save(filename, myobj):
    """
    save object to file using pickle

    The object is pickled with protocol 2 into a bz2-compressed file.  If
    the file cannot be opened, the problem is reported on stderr and the
    function returns None without raising.

    @param filename: name of destination file
    @type filename: str
    @param myobj: object to save (has to be pickleable)
    @type myobj: obj
    """
    try:
        f = bz2.BZ2File(filename, 'wb')
    except IOError:
        # sys.exc_info() instead of "except IOError, details" keeps this
        # clause valid on every Python version.
        details = sys.exc_info()[1]
        sys.stderr.write('File ' + filename + ' cannot be written\n')
        # write() needs a string; passing the exception object itself
        # raised TypeError and masked the real error.
        sys.stderr.write(str(details) + '\n')
        return

    try:
        cPickle.dump(myobj, f, protocol=2)
    finally:
        # close the handle even when pickling fails
        f.close()
def load(filename):
    """
    Load from filename using pickle

    Reads a bz2-compressed pickle and returns the unpickled object.  If
    the file cannot be opened, the problem is reported on stderr and None
    is returned.

    WARNING: unpickling can execute arbitrary code; only load files from
    trusted sources.

    @param filename: name of file to load from
    @type filename: str
    """
    try:
        f = bz2.BZ2File(filename, 'rb')
    except IOError:
        # sys.exc_info() instead of "except IOError, details" keeps this
        # clause valid on every Python version.
        details = sys.exc_info()[1]
        sys.stderr.write('File ' + filename + ' cannot be read\n')
        # write() needs a string; passing the exception object itself
        # raised TypeError and masked the real error.
        sys.stderr.write(str(details) + '\n')
        return

    try:
        myobj = cPickle.load(f)
    finally:
        # close the handle even when unpickling fails
        f.close()
    return myobj
##################################################
num=10
dist=1
width=2.1
traindata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1)
testdata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1);
trainlab=concatenate((-ones(num), ones(num)));
testlab=concatenate((-ones(num), ones(num)));
feats_train=RealFeatures(traindata_real);
feats_test=RealFeatures(testdata_real);
kernel=GaussianKernel(feats_train, feats_train, width);
kernel.io.set_loglevel(MSG_DEBUG)
labels=Labels(trainlab);
svm=SVMLight(2, kernel, labels)
svm.train()
svm.io.set_loglevel(MSG_DEBUG)
##################################################
print "labels:"
print labels.to_string()
print "features"
print feats_train.to_string()
print "kernel"
print kernel.to_string()
print "svm"
print svm.to_string()
print "#################################"
fn = "serialized_svm.bz2"
print "serializing SVM to file", fn
save(fn, svm)
print "#################################"
print "unserializing SVM"
svm2 = load(fn)
print "#################################"
print "comparing training"
svm2.train()
print "objective before serialization:", svm.get_objective()
print "objective after serialization:", svm2.get_objective()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from shogun.Structure import *
import numpy
from numpy import array,Inf,float64,matrix,frompyfunc,zeros
#from IPython.Shell import IPShellEmbed
#ipshell = IPShellEmbed()
import gzip
import scipy
from scipy.io import loadmat
import pickle
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
def _version_tuple(version):
    """
    Return the leading numeric components of a dotted version string.

    Only the digits at the start of each of the first three dot-separated
    fields are used, so '0.10.0' gives (0, 10, 0) and '1.11.0rc1' gives
    (1, 11, 0).  Parsing stops at the first field without leading digits.
    """
    parts = []
    for field in version.split('.')[:3]:
        digits = ''
        for ch in field:
            if not ch.isdigit():
                break
            digits += ch
        if not digits:
            break
        parts.append(int(digits))
    return tuple(parts)


# Map of module paths found in old pickles to their location in scipy >= 0.7.
# The version is compared numerically: as plain strings '0.10.0' sorts before
# '0.7.0', which would wrongly leave the table empty for scipy 0.10 - 0.19.
if _version_tuple(scipy.__version__)[:2] >= (0, 7):
    renametable = {
        'scipy.io.mio5': 'scipy.io.matlab.mio5',
        'scipy.sparse.sparse': 'scipy.sparse',
    }
else:
    renametable = {}
def mapname(name):
    """
    Translate a name through the module rename table.

    @param name: name as read from the pickle stream
    @return: the replacement listed in renametable, or name itself when
             there is no entry for it
    """
    return renametable.get(name, name)
def mapped_load_global(self):
    """
    Handler for the pickle GLOBAL opcode that applies mapname() first.

    Reads two lines from the unpickler (module, then name), strips the
    trailing newline character from each, passes both through mapname(),
    resolves the class with find_class() and appends it via self.append().

    @param self: the unpickler instance the handler is dispatched on
    """
    module_line = self.readline()
    mapped_module = mapname(module_line[:-1])
    name_line = self.readline()
    mapped_name = mapname(name_line[:-1])
    resolved = self.find_class(mapped_module, mapped_name)
    self.append(resolved)
def loads(str):
    """
    Unpickle an object from a string, renaming moved modules on the fly.

    Works like pickle.loads() except that the GLOBAL opcode is handled by
    mapped_load_global(), so module paths listed in renametable are
    translated before the class lookup.

    @param str: pickled data (the parameter name shadows the builtin but is
                kept for backward compatibility with existing callers)
    @return: the reconstructed object
    """
    # NOTE: unpickling can execute arbitrary code -- only feed trusted data.
    stream = StringIO(str)
    unpickler = pickle.Unpickler(stream)
    # `dispatch` is a class attribute shared by every Unpickler.  Work on a
    # private copy so the override does not leak into unrelated pickle
    # loads elsewhere in the process.
    unpickler.dispatch = unpickler.dispatch.copy()
    unpickler.dispatch[pickle.GLOBAL] = mapped_load_global
    return unpickler.load()
def run_test():
    """
    Run the DynProg example on the pickled example data and print the result.

    Reads '../data/DynProg_example_py.pickle.gz', copies the entries of its
    'penalty_array' into a PlifMatrix, configures a DynProg object from the
    remaining entries ('model', 'block', 'state_signals', 'a_trans',
    'seg_path', 'loss'), calls compute_nbest_paths() and prints the states,
    scores and positions reported by the DynProg object.
    """
    # Close the gzip handle explicitly (try/finally also works on Python
    # versions where GzipFile is not a context manager).
    gz = gzip.GzipFile('../data/DynProg_example_py.pickle.gz')
    try:
        raw = gz.read()
    finally:
        gz.close()
    data_dict = loads(raw)

    penalty_array = data_dict['penalty_array']
    num_plifs = len(penalty_array)
    num_limits = len(penalty_array[0].limits)

    pm = PlifMatrix()
    pm.create_plifs(num_plifs, num_limits)

    # Per-plif buffers; every entry is overwritten in the loop below.
    ids = numpy.zeros(num_plifs, dtype=numpy.int32)
    min_values = numpy.zeros(num_plifs, dtype=numpy.float64)
    max_values = numpy.zeros(num_plifs, dtype=numpy.float64)
    # Builtin bool instead of the numpy.bool alias, which newer NumPy
    # releases no longer provide.
    all_use_cache = numpy.zeros(num_plifs, dtype=bool)
    all_use_svm = numpy.zeros(num_plifs, dtype=numpy.int32)
    all_limits = zeros((num_plifs, num_limits))
    all_penalties = zeros((num_plifs, num_limits))
    all_names = [''] * num_plifs
    all_transforms = [''] * num_plifs

    for plif_idx in range(num_plifs):
        plif = penalty_array[plif_idx]
        # The stored ids are shifted down by one here.
        ids[plif_idx] = plif.id - 1
        min_values[plif_idx] = plif.min_value
        max_values[plif_idx] = plif.max_value
        all_use_cache[plif_idx] = plif.use_cache
        all_use_svm[plif_idx] = plif.use_svm
        all_limits[plif_idx] = plif.limits
        all_penalties[plif_idx] = plif.penalties
        all_names[plif_idx] = str(plif.name)
        all_transforms[plif_idx] = str(plif.transform)
        # An empty transform entry stringifies to '[]'; treat it as linear.
        if all_transforms[plif_idx] == '[]':
            all_transforms[plif_idx] = 'linear'

    pm.set_plif_ids(ids)
    pm.set_plif_min_values(min_values)
    pm.set_plif_max_values(max_values)
    pm.set_plif_use_cache(all_use_cache)
    pm.set_plif_use_svm(all_use_svm)
    pm.set_plif_limits(all_limits)
    pm.set_plif_penalties(all_penalties)
    # The name/transform setters are disabled, so all_names and
    # all_transforms are collected but not passed on:
    #pm.set_plif_names(all_names)
    #pm.set_plif_transform_type(all_transforms)

    model = data_dict['model']
    # Only the first two slices along the last axis are used.
    transition_ptrs = model.transition_pointers[:, :, 0:2]
    transition_ptrs = transition_ptrs.astype(numpy.float64)
    pm.compute_plif_matrix(transition_ptrs)

    # init_dyn_prog
    num_svms = 8
    dyn = DynProg(num_svms)
    orf_info = model.orf_info.astype(numpy.int32)
    num_states = orf_info.shape[0]
    dyn.set_num_states(num_states)

    block = data_dict['block']
    seq = str(block.seq)
    # One array element per character of the sequence.
    gene_string = array([elem for elem in seq])

    # precompute_content_svms
    # Positions are shifted down by one before being handed over.
    pos = (block.all_pos - 1).astype(numpy.int32)
    dyn.set_pos(pos)
    dyn.set_gene_string(gene_string)
    dyn.create_word_string()
    dyn.precompute_stop_codons()
    dyn.init_content_svm_value_array(num_svms)
    # The original loaded data_dict['content_weights'] and immediately
    # overwrote it; the example effectively runs with all-zero weights.
    dict_weights = zeros((8, 5440))
    dyn.set_dict_weights(dict_weights.T)
    dyn.precompute_content_values()
    dyn.init_mod_words_array(model.mod_words.astype(numpy.int32))
    pm.compute_signal_plifs(data_dict['state_signals'].astype(numpy.int32))
    dyn.set_orf_info(orf_info)

    dyn.set_p_vector(model.p)
    dyn.set_q_vector(model.q)
    a_trans = data_dict['a_trans'].astype(float64)
    dyn.set_a_trans_matrix(a_trans)
    dyn.check_svm_arrays()

    features = block.features
    dyn.set_observation_matrix(features)
    dyn.set_content_type_array(data_dict['seg_path'].astype(numpy.float64))
    dyn.best_path_set_segment_loss(data_dict['loss'].astype(numpy.float64))

    use_orf = True
    dyn.set_plif_matrices(pm)
    dyn.compute_nbest_paths(features.shape[2], use_orf, 1, True, False)

    # fetch results
    states = dyn.get_states()
    print(states)
    scores = dyn.get_scores()
    print(scores)
    positions = dyn.get_positions()
    print(positions)
# Run the DynProg example only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
run_test()
import gc
from shogun.Features import Alphabet,StringCharFeatures,StringWordFeatures,DNA
from shogun.PreProc import SortWordString, MSG_DEBUG
from shogun.Kernel import CommWordStringKernel, IdentityKernelNormalizer
from numpy import mat
# 141 strings of 400 characters each: 60 built from 'ACGT', then 21 built
# from 'TTGT', then another 60 built from 'ACGT'.
POS = 60 * [100 * 'ACGT'] + 21 * [100 * 'TTGT'] + 60 * [100 * 'ACGT']
# 141 strings of 400 characters each: 60 built from 'ACGT', then 21 built
# from 'TTGT', then another 60 built from 'ACGT'.
NEG = 60 * [100 * 'ACGT'] + 21 * [100 * 'TTGT'] + 60 * [100 * 'ACGT']
# Parameters passed to obtain_from_char() below as
# (order-1, order, gap, reverse).
order=7
gap=0
reverse=False
# Build the same features, preprocessor and kernel ten times in a row
# (xrange: Python 2).
# NOTE(review): indentation was lost in this copy; presumably the loop body
# runs through the K=mat(...) line and the `del` statements execute once
# after the loop -- confirm against the original file.
for i in xrange(10):
# Character features over the DNA alphabet, filled with POS followed by NEG.
alpha=Alphabet(DNA)
traindat=StringCharFeatures(alpha)
traindat.set_features(POS+NEG)
# Word features derived from the character features.
trainudat=StringWordFeatures(traindat.get_alphabet());
trainudat.obtain_from_char(traindat, order-1, order, gap, reverse)
#trainudat.io.set_loglevel(MSG_DEBUG)
# Attach a SortWordString preprocessor to the word features and apply it.
pre = SortWordString()
#pre.io.set_loglevel(MSG_DEBUG)
pre.init(trainudat)
trainudat.add_preproc(pre)
trainudat.apply_preproc()
# CommWordStringKernel with an identity normalizer, initialised with the
# word features on both sides; its kernel matrix is wrapped in numpy.mat.
spec = CommWordStringKernel(10, False)
spec.set_normalizer(IdentityKernelNormalizer())
spec.init(trainudat, trainudat)
K=mat(spec.get_kernel_matrix())
# Remove the module-level data and parameters again.
del POS
del NEG
del order
del gap
del reverse