Created
December 5, 2013 21:08
-
-
Save cwidmer/7813931 to your computer and use it in GitHub Desktop.
python module to create shogun objects for dealing with string kernels and string data within the COFFIN framework
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2.5
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# Written (W) 2010-2013 Christian Widmer
# Copyright (C) 2010-2013 Max-Planck-Society, TU-Berlin, MSKCC
"""
module to create shogun data objects according to given parameters
"""
from shogun.Classifier import SVMLight, LibLinear, L2R_LR | |
from shogun.Kernel import WeightedDegreeStringKernel, LinearKernel, PolyKernel, GaussianKernel, CTaxonomy | |
from shogun.Features import StringCharFeatures, RealFeatures, CombinedFeatures, StringWordFeatures, SortWordString | |
from shogun.Features import DNA, PROTEIN, BinaryLabels | |
from shogun.Kernel import WeightedDegreeStringKernel, CombinedKernel, WeightedCommWordStringKernel, WeightedDegreePositionStringKernel, WeightedDegreeRBFKernel | |
from shogun.Features import StringCharFeatures, DNA, StringWordFeatures, CombinedFeatures | |
from shogun.Features import CombinedDotFeatures, HashedWDFeatures, HashedWDFeaturesTransposed, WDFeatures, ImplicitWeightedSpecFeatures, StringByteFeatures | |
import numpy | |
def create_labels(labels):
    """
    wrap raw label values in a shogun BinaryLabels object

    @param labels: label values; they are converted with numpy.double
                   before being handed to BinaryLabels
    @return: the BinaryLabels instance built from the converted values
    """
    label_values = numpy.double(labels)
    return BinaryLabels(label_values)
######################################################## | |
# string-kernel based stuff | |
######################################################## | |
def get_spectrum_features(data, order=3, gap=0, reverse=True):
    """
    build the sorted word features used by the spectrum kernel

    @param data: sequences wrapped in StringCharFeatures over the DNA alphabet
    @param order: passed to obtain_from_char as (order-1, order)
    @param gap: forwarded unchanged to obtain_from_char
    @param reverse: forwarded unchanged to obtain_from_char
    @return: StringWordFeatures with a SortWordString preprocessor applied
    """
    # character-level view of the raw sequences
    raw_chars = StringCharFeatures(data, DNA)

    # word features share the alphabet of the character features
    word_feat = StringWordFeatures(raw_chars.get_alphabet())
    word_feat.obtain_from_char(raw_chars, order - 1, order, gap, reverse)

    # attach the sorting preprocessor and apply it right away
    sorter = SortWordString()
    sorter.init(word_feat)
    word_feat.add_preprocessor(sorter)
    word_feat.apply_preprocessor()

    return word_feat
def get_wd_features(data, feat_type="dna"):
    """
    create feature object for wdk

    @param data: sequences handed to set_features on the new feature object
    @param feat_type: "dna" or "protein"; selects the alphabet
                      (DNA or PROTEIN) of the StringCharFeatures
    @return: StringCharFeatures holding data
    @raise ValueError: if feat_type is neither "dna" nor "protein"
    """
    # resolve the alphabet first so an invalid feat_type fails before any
    # feature object is constructed
    if feat_type == "dna":
        alphabet = DNA
    elif feat_type == "protein":
        alphabet = PROTEIN
    else:
        # ValueError is a subclass of Exception, so handlers written for the
        # previous generic Exception keep working
        raise ValueError("unknown feature type: %r" % (feat_type,))

    feat = StringCharFeatures(alphabet)
    feat.set_features(data)

    return feat
def create_empty_promoter_kernel(degree_wdk, kernel_cache=1000):
    """
    creates an uninitialized promoter kernel

    The result is a CombinedKernel with three sub-kernels appended in this
    order: a WeightedDegreeStringKernel for the center part, followed by two
    WeightedCommWordStringKernel instances for the left and right borders.

    @param degree_wdk: degree passed to WeightedDegreeStringKernel
    @param kernel_cache: total cache budget; each sub-kernel is given a third
    @return: CombinedKernel that has not been initialized with features yet
    """
    # every sub-kernel gets the same share of the cache budget; floor
    # division keeps the share an integer for integer kernel_cache no matter
    # which division semantics are active
    cache_share = kernel_cache // 3

    # TODO: enable shifts (WeightedDegreePositionStringKernel with a shifts
    # vector) as an alternative for the centered kernel
    kernel_center = WeightedDegreeStringKernel(degree_wdk)
    kernel_center.set_cache_size(cache_share)

    # border spectrum kernels
    use_sign = False
    kernel_left = WeightedCommWordStringKernel(cache_share, use_sign)
    kernel_right = WeightedCommWordStringKernel(cache_share, use_sign)

    # assemble combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(kernel_center)
    kernel.append_kernel(kernel_left)
    kernel.append_kernel(kernel_right)

    return kernel
def create_promoter_kernel(examples, center_offset, center_pos, degree_wdk, degree_spectrum, kernel_cache=1000):
    """
    creates a promoter kernel

    @param examples: sequences the kernel is initialized on
    @param center_offset: forwarded to create_promoter_features
    @param center_pos: forwarded to create_promoter_features
    @param degree_wdk: degree of the center kernel, forwarded to
                       create_empty_promoter_kernel
    @param degree_spectrum: order of the border spectrum features, forwarded
                            to create_promoter_features
    @param kernel_cache: cache budget forwarded to create_empty_promoter_kernel
    @return: combined kernel initialized with the promoter features on both
             sides
    """
    # create uninitialized kernel
    kernel = create_empty_promoter_kernel(degree_wdk, kernel_cache)

    # get features; degree_spectrum is passed on so the border features
    # honor the requested order instead of silently using the default
    feat = create_promoter_features(examples, center_offset, center_pos, degree_spectrum)

    # init combined kernel
    kernel.init(feat, feat)

    return kernel


def create_promoter_features(data, center_offset, center_pos, degree_spectrum=3):
    """
    creates promoter combined features

    @param data: sequences; all are asserted to have the same length
    @param center_offset: forwarded to split_data_promoter
    @param center_pos: forwarded to split_data_promoter
    @param degree_spectrum: order used for the left/right spectrum features;
                            defaults to 3, the default order of
                            get_spectrum_features
    @return: CombinedFeatures holding center, left and right features,
             appended in that order
    """
    # the parenthesized form prints the same text under the Python 2 print
    # statement and the Python 3 print function
    print("creating promoter features")

    (center, left, right) = split_data_promoter(data, center_offset, center_pos)

    # sanity check sequences: every part is compared against the first one
    assert len(center) == len(left) == len(right)

    for i in range(1, len(center)):
        assert len(data[i]) == len(data[0]), "data length mismatch %i: %i!=%i" % (i, len(data[0]), len(data[i]))
        assert len(center[i]) == len(center[0]), "center length mismatch %i: %i!=%i" % (i, len(center[0]), len(center[i]))
        assert len(left[i]) == len(left[0]), "left length mismatch %i: %i!=%i" % (i, len(left[0]), len(left[i]))
        assert len(right[i]) == len(right[0]), "right length mismatch %i: %i!=%i" % (i, len(right[0]), len(right[i]))

    # set up base features
    feat_center = StringCharFeatures(DNA)
    feat_center.set_features(center)
    feat_left = get_spectrum_features(left, order=degree_spectrum)
    feat_right = get_spectrum_features(right, order=degree_spectrum)

    # construct combined features
    feat = CombinedFeatures()
    feat.append_feature_obj(feat_center)
    feat.append_feature_obj(feat_left)
    feat.append_feature_obj(feat_right)

    return feat
def split_data_promoter(data, center_offset, center_pos):
    '''
    split promoter data in three parts

    @param data: iterable of sliceable sequences
    @param center_offset: half-width of the center window
    @param center_pos: index the sequences are split at
    @return: tuple (center, left, right) of lists with one entry per
             sequence: center is seq[center_pos-center_offset:
             center_pos+center_offset], left is seq[:center_pos] and right
             is seq[center_pos:]
    @raise ValueError: if the center window starts at a negative index but
                       ends at a non-negative one
    '''
    center_start = center_pos - center_offset
    center_end = center_pos + center_offset

    # a window that straddles index 0 would be interpreted by slicing as
    # "start counted from the end, stop counted from the front" and silently
    # yield a wrong (usually empty) center; windows that are entirely
    # end-relative (both bounds negative) remain valid
    if center_start < 0 <= center_end:
        raise ValueError(
            "center window [%i, %i) wraps around the sequence start "
            "(center_pos=%i, center_offset=%i)"
            % (center_start, center_end, center_pos, center_offset)
        )

    center = [seq[center_start:center_end] for seq in data]
    left = [seq[:center_pos] for seq in data]
    right = [seq[center_pos:] for seq in data]

    return (center, left, right)
######################################################## | |
# linear stuff | |
######################################################## | |
def create_hashed_promoter_features(data, center_offset, center_pos, degree_wdk, degree_spectrum):
    """
    build the combined dot features of a promoter (linear SVM variant)

    @param data: sequences, split by split_data_promoter
    @param center_offset: forwarded to split_data_promoter
    @param center_pos: forwarded to split_data_promoter
    @param degree_wdk: degree for the hashed WDK features of the center part
    @param degree_spectrum: degree for the spectrum features of both borders
    @return: CombinedDotFeatures with center, left and right features
             appended in that order
    """
    # same text is emitted by the Python 2 print statement
    print("creating __hashed__ promoter features (for linear SVM)")

    center, left, right = split_data_promoter(data, center_offset, center_pos)

    # base features, in the order in which they are appended below
    parts = (
        create_hashed_features_wdk(center, degree_wdk),
        create_hashed_features_spectrum(left, degree_spectrum),
        create_hashed_features_spectrum(right, degree_spectrum),
    )

    combined = CombinedDotFeatures()
    for part in parts:
        combined.append_feature_obj(part)

    return combined
def create_hashed_features_wdk(data, degree):
    """
    build hashed dot features for the weighted degree kernel

    @param data: sequences wrapped in StringCharFeatures over the DNA alphabet
    @param degree: passed twice to HashedWDFeaturesTransposed (third and
                   fourth constructor argument)
    @return: HashedWDFeaturesTransposed built on byte features of data
    """
    # settings for the char -> byte conversion
    byte_order = 1
    byte_gap = 0
    byte_reverse = True

    # settings for the hashed feature object
    first_degree = 0
    num_hash_bits = 4

    # raw byte features derived from the character features
    char_feats = StringCharFeatures(data, DNA)
    byte_feats = StringByteFeatures(DNA)
    byte_feats.obtain_from_char(char_feats, byte_order - 1, byte_order, byte_gap, byte_reverse)

    # the original source keeps HashedWDFeatures and WDFeatures around as
    # commented-out alternatives to the transposed variant used here
    return HashedWDFeaturesTransposed(byte_feats, first_degree, degree, degree, num_hash_bits)
def create_hashed_features_spectrum(data, degree):
    """
    build dot features for the spectrum kernel

    @param data: sequences wrapped in StringCharFeatures over the DNA alphabet
    @param degree: used as the order of the word features, i.e.
                   obtain_from_char is called with (degree-1, degree)
    @return: ImplicitWeightedSpecFeatures on sorted word features, created
             with normalization switched on
    """
    # fixed settings
    word_gap = 0
    word_reverse = True
    use_normalization = True

    # word features derived from the character features
    char_feats = StringCharFeatures(data, DNA)
    word_feats = StringWordFeatures(char_feats.get_alphabet())
    word_feats.obtain_from_char(char_feats, degree - 1, degree, word_gap, word_reverse)

    # sort the words via preprocessor
    sorter = SortWordString()
    sorter.init(word_feats)
    word_feats.add_preprocessor(sorter)
    word_feats.apply_preprocessor()

    return ImplicitWeightedSpecFeatures(word_feats, use_normalization)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment