2019
Petegrosso, Raphael; Li, Zhuliu; Srour, Molly A.; Saad, Yousef; Zhang, Wei; Kuang, Rui
Scalable Remote Homology Detection and Fold Recognition in Massive Protein Networks Journal Article
In: PROTEINS: Structure, Function, and Bioinformatics, vol. 87, no. 6, pp. 478–491, 2019.
Abstract | Links | BibTeX | Tags: Protein Remote Homology Detection
@comment{Petegrosso et al. 2019: journal article introducing LP-LOKA. The doi value is the identifier embedded in the url field.}
@article{scalable2019petegrosso,
  title     = {Scalable Remote Homology Detection and Fold Recognition in Massive Protein Networks},
  author    = {Raphael Petegrosso and Zhuliu Li and Molly A. Srour and Yousef Saad and Wei Zhang and Rui Kuang},
  url       = {https://onlinelibrary.wiley.com/doi/abs/10.1002/prot.25669},
  doi       = {10.1002/prot.25669},
  year      = {2019},
  date      = {2019-01-31},
  journal   = {PROTEINS: Structure, Function, and Bioinformatics},
  volume    = {87},
  number    = {6},
  pages     = {478--491},
  abstract  = {The global connectivities in very large protein similarity networks contain traces of evolution among the proteins for detecting protein remote evolutionary relations or structural similarities. To investigate how well a protein network captures the evolutionary information, a key limitation is the intensive computation of pairwise sequence similarities needed to construct very large protein networks. In this paper, we introduce Label Propagation on Low-rank Kernel Approximation (LP-LOKA) for searching massively large protein networks. LP-LOKA propagates initial protein similarities in a low-rank graph by Nystrom approximation without computing all pairwise similarities. With scalable parallel implementations based on distributed-memory using message-passing interface and Apache-Hadoop/Spark on cloud, LP-LOKA can search protein networks with one million proteins or more. In the experiments on Swiss-Prot/ADDA/CASP data, LP-LOKA significantly improved protein ranking over the widely used HMM-HMM or profile-sequence alignment methods utilizing large protein networks. It was observed that the larger the protein similarity network, the better the performance, especially on relatively small protein superfamilies and folds. The results suggest that computing massively large protein network is necessary to meet the growing need of annotating proteins from newly sequenced species and LP-LOKA is both scalable and accurate for searching massively large protein networks.},
  keywords  = {Protein Remote Homology Detection},
  pubstate  = {published},
  tppubtype = {article}
}
2009
Min, Martin Renqiang; Kuang, Rui; Bonner, Anthony J; Zhang, Zhaolei
Learning Random-Walk Kernels for Protein Remote Homology Identification and Motif Discovery Proceedings Article
In: SDM, pp. 133–144, SIAM 2009, ISBN: 978-0-89871-682-5.
Abstract | Links | BibTeX | Tags: Kernel Method, Protein Remote Homology Detection
@comment{Min et al. 2009: SIAM Data Mining (SDM) conference paper. The title carries no final period because bibliography styles add their own punctuation.}
@inproceedings{min2009learning,
  title        = {Learning Random-Walk Kernels for Protein Remote Homology Identification and Motif Discovery},
  author       = {Martin Renqiang Min and Rui Kuang and Anthony J Bonner and Zhaolei Zhang},
  url          = {http://compbio.cs.umn.edu/wp-content/uploads/2017/10/12E97816119727952E12.pdf},
  doi          = {10.1137/1.9781611972795.12},
  isbn         = {978-0-89871-682-5},
  year         = {2009},
  date         = {2009-04-30},
  booktitle    = {Proceedings of the 2009 SIAM International Conference on Data Mining (SDM)},
  pages        = {133--144},
  organization = {SIAM},
  abstract     = {Random-walk based algorithms are good choices for solving many classification problems with limited labeled data and a large amount of unlabeled data. However, it is difficult to choose the optimal number of random steps, and the results are very sensitive to the parameter chosen. In this paper, we will discuss how to better identify protein remote homology than any other algorithm using a learned random-walk kernel based on a positive linear combination of random-walk kernels with different random steps, which leads to a convex combination of kernels. The resulting kernel has much better prediction performance than the state-of-the-art profile kernel for protein remote homology identification. On the SCOP benchmark dataset, the overall mean ROC50 score on 54 protein families we obtained using the new kernel is above 0.90, which has almost perfect prediction performance on most of the 54 families and has significant improvement over the best published result; moreover, our approach based on learned random-walk kernels can effectively identify meaningful protein sequence motifs that are responsible for discriminating the memberships of protein sequences' remote homology in SCOP.},
  keywords     = {Kernel Method, Protein Remote Homology Detection},
  pubstate     = {published},
  tppubtype    = {inproceedings}
}
Ngo, Thanh; Kuang, Rui
Partial profile alignment kernels for protein classification Proceedings Article
In: 2009 IEEE International Workshop on Genomic Signal Processing and Statistics, pp. 1–4, IEEE 2009, ISBN: 978-1-4244-4761-9.
Abstract | Links | BibTeX | Tags: Kernel Method, Protein Remote Homology Detection
@comment{Ngo and Kuang 2009: IEEE GENSIPS workshop paper on partial profile alignment kernels. The url points to a PDF on compbio.cs.umn.edu.}
@inproceedings{ngo2009partial,
  author       = {Thanh Ngo and Rui Kuang},
  title        = {Partial profile alignment kernels for protein classification},
  booktitle    = {2009 IEEE International Workshop on Genomic Signal Processing and Statistics},
  organization = {IEEE},
  pages        = {1--4},
  year         = {2009},
  date         = {2009-01-01},
  isbn         = {978-1-4244-4761-9},
  doi          = {10.1109/GENSIPS.2009.5174328},
  url          = {http://compbio.cs.umn.edu/wp-content/uploads/2017/10/05174328.pdf},
  keywords     = {Kernel Method, Protein Remote Homology Detection},
  pubstate     = {published},
  tppubtype    = {inproceedings},
  abstract     = {Remote homology detection and fold recognition are the central problems in protein classification. In real applications, kernel algorithms that are both accurate and efficient are required for classification of large databases. We explore a class of partial profile alignment kernels to be used with support vector machines (SVMs) for remote homology detection and fold recognition. While existing profile-based kernels use the whole profiles to determine the similarity between pairs of proteins, the partial profile alignment kernels are derived from part of the position specific scoring matrices (PSSMs) in the profiles for alignment. Specifically, at each position in the PSSM, only amino acids in the mutation neighborhood of the corresponding amino acid in the original protein sequence are considered for alignment to remove noise and improve computing efficiency. Our experiments on SCOP bench datasets show that the partial profile alignment kernels achieved overall better classification results for both fold recognition and remote homology detection than profile kernels and profile-alignment kernels. In addition, our algorithm using only a fraction of the profiles saves the cost of computing the kernels significantly, compared to the full-profile alignment methods.}
}
2008
Kuang, Rui; Gu, Jianying; Cai, Hong; Wang, Yufeng
Improved prediction of malaria degradomes by supervised learning with SVM and profile kernel Journal Article
In: Genetica, vol. 136, no. 1, pp. 189–209, 2008.
Abstract | Links | BibTeX | Tags: Protein Remote Homology Detection
@comment{Kuang et al., Genetica 136(1). The citation key says 2009 while the year field is 2008 (date 2008-12-06); the key is kept unchanged so existing citations still resolve. SVM is brace-protected so sentence-casing styles do not lowercase it.}
@article{kuang2009improved,
  title     = {Improved prediction of malaria degradomes by supervised learning with {SVM} and profile kernel},
  author    = {Rui Kuang and Jianying Gu and Hong Cai and Yufeng Wang},
  url       = {http://link.springer.com/article/10.1007/s10709-008-9336-9},
  doi       = {10.1007/s10709-008-9336-9},
  year      = {2008},
  date      = {2008-12-06},
  journal   = {Genetica},
  volume    = {136},
  number    = {1},
  pages     = {189--209},
  publisher = {Springer},
  abstract  = {The spread of drug resistance through malaria parasite populations calls for the development of new therapeutic strategies. However, the seemingly promising genomics-driven target identification paradigm is hampered by the weak annotation coverage. To identify potentially important yet uncharacterized proteins, we apply support vector machines using profile kernels, a supervised discriminative machine learning technique for remote homology detection, as a complement to the traditional alignment based algorithms. In this study, we focus on the prediction of proteases, which have long been considered attractive drug targets because of their indispensable roles in parasite development and infection. Our analysis demonstrates that an abundant and complex repertoire is conserved in five Plasmodium parasite species. Several putative proteases may be important components in networks that mediate cellular processes, including hemoglobin digestion, invasion, trafficking, cell cycle fate, and signal transduction. This catalog of proteases provides a short list of targets for functional characterization and rational inhibitor design.},
  keywords  = {Protein Remote Homology Detection},
  pubstate  = {published},
  tppubtype = {article}
}
2007
Melvin, Iain; Ie, Eugene; Kuang, Rui; Weston, Jason; Noble, William Stafford; Leslie, Christina
SVM-Fold: a tool for discriminative multi-class protein fold and superfamily recognition Journal Article
In: BMC bioinformatics, vol. 8, no. 4, 2007.
Abstract | Links | BibTeX | Tags: Protein Remote Homology Detection, String Kernels
@comment{Melvin et al. 2007: BMC Bioinformatics supplement article. The number and pages fields follow the doi suffix 8-S4-S2 (volume 8, Suppl 4, article S2). SVM-Fold is brace-protected against sentence-casing styles.}
@article{melvin2007svm,
  title     = {{SVM-Fold}: a tool for discriminative multi-class protein fold and superfamily recognition},
  author    = {Iain Melvin and Eugene Ie and Rui Kuang and Jason Weston and William Stafford Noble and Christina Leslie},
  url       = {http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-8-S4-S2},
  doi       = {10.1186/1471-2105-8-S4-S2},
  year      = {2007},
  date      = {2007-05-22},
  journal   = {BMC Bioinformatics},
  volume    = {8},
  number    = {Suppl 4},
  pages     = {S2},
  publisher = {BioMed Central},
  abstract  = {Background
Predicting a protein's structural class from its amino acid sequence is a fundamental problem in computational biology. Much recent work has focused on developing new representations for protein sequences, called string kernels, for use with support vector machine (SVM) classifiers. However, while some of these approaches exhibit state-of-the-art performance at the binary protein classification problem, i.e. discriminating between a particular protein class and all other classes, few of these studies have addressed the real problem of multi-class superfamily or fold recognition. Moreover, there are only limited software tools and systems for SVM-based protein classification available to the bioinformatics community.
Results
We present a new multi-class SVM-based protein fold and superfamily recognition system and web server called SVM-Fold, which can be found at http://svm-fold.c2b2.columbia.edu. Our system uses an efficient implementation of a state-of-the-art string kernel for sequence profiles, called the profile kernel, where the underlying feature representation is a histogram of inexact matching k-mer frequencies. We also employ a novel machine learning approach to solve the difficult multi-class problem of classifying a sequence of amino acids into one of many known protein structural classes. Binary one-vs-the-rest SVM classifiers that are trained to recognize individual structural classes yield prediction scores that are not comparable, so that standard "one-vs-all" classification fails to perform well. Moreover, SVMs for classes at different levels of the protein structural hierarchy may make useful predictions, but one-vs-all does not try to combine these multiple predictions. To deal with these problems, our method learns relative weights between one-vs-the-rest classifiers and encodes information about the protein structural hierarchy for multi-class prediction. In large-scale benchmark results based on the SCOP database, our code weighting approach significantly improves on the standard one-vs-all method for both the superfamily and fold prediction in the remote homology setting and on the fold recognition problem. Moreover, our code weight learning algorithm strongly outperforms nearest-neighbor methods based on PSI-BLAST in terms of prediction accuracy on every structure classification problem we consider.
Conclusion
By combining state-of-the-art SVM kernel methods with a novel multi-class algorithm, the SVM-Fold system delivers efficient and accurate protein fold and superfamily recognition.},
  keywords  = {Protein Remote Homology Detection, String Kernels},
  pubstate  = {published},
  tppubtype = {article}
}
Predicting a protein's structural class from its amino acid sequence is a fundamental problem in computational biology. Much recent work has focused on developing new representations for protein sequences, called string kernels, for use with support vector machine (SVM) classifiers. However, while some of these approaches exhibit state-of-the-art performance at the binary protein classification problem, i.e. discriminating between a particular protein class and all other classes, few of these studies have addressed the real problem of multi-class superfamily or fold recognition. Moreover, there are only limited software tools and systems for SVM-based protein classification available to the bioinformatics community.
Results
We present a new multi-class SVM-based protein fold and superfamily recognition system and web server called SVM-Fold, which can be found at http://svm-fold.c2b2.columbia.edu. Our system uses an efficient implementation of a state-of-the-art string kernel for sequence profiles, called the profile kernel, where the underlying feature representation is a histogram of inexact matching k-mer frequencies. We also employ a novel machine learning approach to solve the difficult multi-class problem of classifying a sequence of amino acids into one of many known protein structural classes. Binary one-vs-the-rest SVM classifiers that are trained to recognize individual structural classes yield prediction scores that are not comparable, so that standard "one-vs-all" classification fails to perform well. Moreover, SVMs for classes at different levels of the protein structural hierarchy may make useful predictions, but one-vs-all does not try to combine these multiple predictions. To deal with these problems, our method learns relative weights between one-vs-the-rest classifiers and encodes information about the protein structural hierarchy for multi-class prediction. In large-scale benchmark results based on the SCOP database, our code weighting approach significantly improves on the standard one-vs-all method for both the superfamily and fold prediction in the remote homology setting and on the fold recognition problem. Moreover, our code weight learning algorithm strongly outperforms nearest-neighbor methods based on PSI-BLAST in terms of prediction accuracy on every structure classification problem we consider.
Conclusion
By combining state-of-the-art SVM kernel methods with a novel multi-class algorithm, the SVM-Fold system delivers efficient and accurate protein fold and superfamily recognition.
2006
Weston, Jason; Kuang, Rui; Leslie, Christina; Noble, William Stafford
Protein ranking by semi-supervised network propagation Journal Article
In: BMC bioinformatics, vol. 7, no. 1, pp. 9, 2006.
Abstract | Links | BibTeX | Tags: Protein Remote Homology Detection
@comment{Weston et al. 2006: BMC Bioinformatics supplement article. The number and pages fields follow the doi suffix 7-S1-S10 (volume 7, Suppl 1, article S10).}
@article{weston2006protein,
  title     = {Protein ranking by semi-supervised network propagation},
  author    = {Jason Weston and Rui Kuang and Christina Leslie and William Stafford Noble},
  url       = {http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-7-S1-S10},
  doi       = {10.1186/1471-2105-7-S1-S10},
  year      = {2006},
  date      = {2006-03-20},
  journal   = {BMC Bioinformatics},
  volume    = {7},
  number    = {Suppl 1},
  pages     = {S10},
  publisher = {BioMed Central},
  abstract  = {Background: Biologists regularly search DNA or protein databases for sequences that share an evolutionary or functional relationship with a given query sequence. Traditional search methods, such as BLAST and PSI-BLAST, focus on detecting statistically significant pairwise sequence alignments and often miss more subtle sequence similarity. Recent work in the machine learning community has shown that exploiting the global structure of the network defined by these pairwise similarities can help detect more remote relationships than a purely local measure.
Methods: We review RankProp, a ranking algorithm that exploits the global network structure of similarity relationships among proteins in a database by performing a diffusion operation on a protein similarity network with weighted edges. The original RankProp algorithm is unsupervised. Here, we describe a semi-supervised version of the algorithm that uses labeled examples. Three possible ways of incorporating label information are considered: (i) as a validation set for model selection, (ii) to learn a new network, by choosing which transfer function to use for a given query, and (iii) to estimate edge weights, which measure the probability of inferring structural similarity.
Results: Benchmarked on a human-curated database of protein structures, the original RankProp algorithm provides significant improvement over local network search algorithms such as PSIBLAST. Furthermore, we show here that labeled data can be used to learn a network without any need for estimating parameters of the transfer function, and that diffusion on this learned network produces better results than the original RankProp algorithm with a fixed network.
Conclusion: In order to gain maximal information from a network, labeled and unlabeled data should be used to extract both local and global structure.},
  keywords  = {Protein Remote Homology Detection},
  pubstate  = {published},
  tppubtype = {article}
}
Methods: We review RankProp, a ranking algorithm that exploits the global network structure of similarity relationships among proteins in a database by performing a diffusion operation on a protein similarity network with weighted edges. The original RankProp algorithm is unsupervised. Here, we describe a semi-supervised version of the algorithm that uses labeled examples. Three possible ways of incorporating label information are considered: (i) as a validation set for model selection, (ii) to learn a new network, by choosing which transfer function to use for a given query, and (iii) to estimate edge weights, which measure the probability of inferring structural similarity.
Results: Benchmarked on a human-curated database of protein structures, the original RankProp algorithm provides significant improvement over local network search algorithms such as PSIBLAST. Furthermore, we show here that labeled data can be used to learn a network without any need for estimating parameters of the transfer function, and that diffusion on this learned network produces better results than the original RankProp algorithm with a fixed network.
Conclusion: In order to gain maximal information from a network, labeled and unlabeled data should be used to extract both local and global structure.
2005
Noble, William Stafford; Kuang, Rui; Leslie, Christina; Weston, Jason
Identifying remote protein homologs by network propagation Journal Article
In: FEBS J, vol. 272, no. 20, 2005.
Abstract | Links | BibTeX | Tags: Protein Remote Homology Detection
@comment{Noble et al. 2005: FEBS Journal 272(20). The citation key keeps its original misspelling so existing citations still resolve; only the title text is corrected.}
@article{noble2005idetifying,
  title     = {Identifying remote protein homologs by network propagation},
  author    = {William Stafford Noble and Rui Kuang and Christina Leslie and Jason Weston},
  url       = {http://onlinelibrary.wiley.com/doi/10.1111/j.1742-4658.2005.04947.x/abstract},
  doi       = {10.1111/j.1742-4658.2005.04947.x},
  year      = {2005},
  date      = {2005-10-07},
  journal   = {FEBS Journal},
  volume    = {272},
  number    = {20},
  abstract  = {Perhaps the most widely used applications of bioinformatics are tools such as psi-blast for searching sequence databases. We describe a recently developed protein database search algorithm called rankprop. rankprop relies upon a precomputed network of pairwise protein similarities. The algorithm performs a diffusion operation from a specified query protein across the protein similarity network. The resulting activation scores, assigned to each database protein, encode information about the global structure of the protein similarity network. This type of algorithm has a rich history in associationist psychology, artificial intelligence and web search. We describe the rankprop algorithm and its relatives, and we provide evidence that the algorithm successfully improves upon the rankings produced by psi-blast.},
  keywords  = {Protein Remote Homology Detection},
  pubstate  = {published},
  tppubtype = {article}
}
Kuang, Rui; Ie, Eugene; Wang, Ke; Wang, Kai; Siddiqi, Mahira; Freund, Yoav; Leslie, Christina
Profile-based string kernels for remote homology detection and motif extraction Journal Article
In: Journal of bioinformatics and computational biology, vol. 3, no. 03, 2005.
Abstract | Links | BibTeX | Tags: Protein Remote Homology Detection, String Kernels
@comment{Kuang et al. 2005: journal version of the profile kernel paper (the conference version is kuang2005profileb). The doi field holds the bare identifier without a resolver prefix.}
@article{kuang2005profile,
  title     = {Profile-based string kernels for remote homology detection and motif extraction},
  author    = {Rui Kuang and Eugene Ie and Ke Wang and Kai Wang and Mahira Siddiqi and Yoav Freund and Christina Leslie},
  url       = {http://compbio.cs.umn.edu/paper/jbcb-profile-kernel.pdf},
  doi       = {10.1142/S021972000500120X},
  year      = {2005},
  date      = {2005-10-02},
  journal   = {Journal of Bioinformatics and Computational Biology},
  volume    = {3},
  number    = {3},
  publisher = {World Scientific},
  abstract  = {We introduce novel profile-based string kernels for use with support vector machines (SVMs) for the problems of protein classification and remote homology detection. These kernels use probabilistic profiles, such as those produced by the PSI-BLAST algorithm, to define position-dependent mutation neighborhoods along protein sequences for inexact matching of k-length subsequences (“k-mers”) in the data. By use of an efficient data structure, the kernels are fast to compute once the profiles have been obtained. For example, the time needed to run PSI-BLAST in order to build the profiles is significantly longer than both the kernel computation time and the SVM training time. We present remote homology detection experiments based on the SCOP database where we show that profile-based string kernels used with SVM classifiers strongly outperform all recently presented supervised SVM methods. We further examine how to incorporate predicted secondary structure information into the profile kernel to obtain a small but significant performance improvement. We also show how we can use the learned SVM classifier to extract “discriminative sequence motifs”—short regions of the original profile that contribute almost all the weight of the SVM classification score—and show that these discriminative motifs correspond to meaningful structural features in the protein data. The use of PSI-BLAST profiles can be seen as a semi-supervised learning technique, since PSI-BLAST leverages unlabeled data from a large sequence database to build more informative profiles. Recently presented “cluster kernels” give general semi-supervised methods for improving SVM protein classification performance. We show that our profile kernel results also outperform cluster kernels while providing much better scalability to large datasets.},
  keywords  = {Protein Remote Homology Detection, String Kernels},
  pubstate  = {published},
  tppubtype = {article}
}
Kuang, Rui; Weston, Jason; Noble, William Stafford; Leslie, Christina
Motif-based protein ranking by network propagation Journal Article
In: Bioinformatics, vol. 21, no. 19, 2005.
Abstract | Links | BibTeX | Tags: Protein Remote Homology Detection
@comment{Kuang et al. 2005: Bioinformatics 21(19) article introducing MotifProp. No pages field is recorded for this entry.}
@article{kuang2005motif,
  author    = {Rui Kuang and Jason Weston and William Stafford Noble and Christina Leslie},
  title     = {Motif-based protein ranking by network propagation},
  journal   = {Bioinformatics},
  publisher = {Oxford Univ Press},
  volume    = {21},
  number    = {19},
  year      = {2005},
  date      = {2005-08-02},
  doi       = {10.1093/bioinformatics/bti608},
  url       = {http://bioinformatics.oxfordjournals.org/content/21/19/3711.full},
  keywords  = {Protein Remote Homology Detection},
  pubstate  = {published},
  tppubtype = {article},
  abstract  = {Motivation: Sequence similarity often suggests evolutionary relationships between protein sequences that can be important for inferring similarity of structure or function. The most widely-used pairwise sequence comparison algorithms for homology detection, such as BLAST and PSI-BLAST, often fail to detect less conserved remotely-related targets.
Results: In this paper, we propose a new general graph-based propagation algorithm called MotifProp to detect more subtle similarity relationships than pairwise comparison methods. MotifProp is based on a protein-motif network, in which edges connect proteins and the k-mer based motif features that they contain. We show that our new motif-based propagation algorithm can improve the ranking results over a base algorithm, such as PSI-BLAST, that is used to initialize the ranking. Despite the complex structure of the protein-motif network, MotifProp can be easily interpreted using the top-ranked motifs and motif-rich regions induced by the propagation, both of which are helpful for discovering conserved structural components in remote homologies.}
}
Results: In this paper, we propose a new general graph-based propagation algorithm called MotifProp to detect more subtle similarity relationships than pairwise comparison methods. MotifProp is based on a protein-motif network, in which edges connect proteins and the k-mer based motif features that they contain. We show that our new motif-based propagation algorithm can improve the ranking results over a base algorithm, such as PSI-BLAST, that is used to initialize the ranking. Despite the complex structure of the protein-motif network, MotifProp can be easily interpreted using the top-ranked motifs and motif-rich regions induced by the propagation, both of which are helpful for discovering conserved structural components in remote homologies.
2004
Leslie, Christina; Kuang, Rui
Fast string kernels using inexact matching for protein sequences Journal Article
In: Journal of Machine Learning Research, vol. 5, no. Nov, 2004.
Abstract | Links | BibTeX | Tags: Protein Remote Homology Detection, String Kernels
@comment{Leslie and Kuang 2004: JMLR volume 5. The source stored the month name Nov in the number field; it is recorded in the month field instead, using the standard macro. The mismatch-kernel constant in the abstract is written as LaTeX math.}
@article{leslie2004fast,
  title     = {Fast string kernels using inexact matching for protein sequences},
  author    = {Christina Leslie and Rui Kuang},
  url       = {http://jmlr.csail.mit.edu/papers/volume5/leslie04a/leslie04a.pdf},
  year      = {2004},
  month     = nov,
  date      = {2004-11-01},
  journal   = {Journal of Machine Learning Research},
  volume    = {5},
  abstract  = {We describe several families of k-mer based string kernels related to the recently presented mismatch kernel and designed for use with support vector machines (SVMs) for classification of protein sequence data. These new kernels – restricted gappy kernels, substitution kernels, and wildcard kernels – are based on feature spaces indexed by k-length subsequences (“k-mers”) from the string alphabet Σ. However, for all kernels we define here, the kernel value K(x,y) can be computed in O(cK(|x| + |y|)) time, where the constant cK depends on the parameters of the kernel but is independent of the size |Σ| of the alphabet. Thus the computation of these kernels is linear in the length of the sequences, like the mismatch kernel, but we improve upon the parameter-dependent constant $c_K = k^{m+1} |\Sigma|^{m}$ of the (k,m)-mismatch kernel. We compute the kernels efficiently using a trie data structure and relate our new kernels to the recently described transducer formalism. In protein classification experiments on two benchmark SCOP data sets, we show that our new faster kernels achieve SVM classification performance comparable to the mismatch kernel and the Fisher kernel derived from profile hidden Markov models, and we investigate the dependence of the kernels on parameter choice.},
  keywords  = {Protein Remote Homology Detection, String Kernels},
  pubstate  = {published},
  tppubtype = {article}
}
Kuang, Rui; Ie, Eugene; Wang, Ke; Wang, Kai; Siddiqi, Mahira; Freund, Yoav; Leslie, Christina
Profile-based string kernels for remote homology detection and motif extraction Proceedings Article
In: CSB 2004, IEEE, 2004, ISBN: 0-7695-2194-0.
Abstract | Links | BibTeX | Tags: Protein Remote Homology Detection, String Kernels
@comment{Kuang et al. 2004: conference version (CSB 2004) of the profile kernel paper; the journal version is kuang2005profile. The citation key says 2005 while the year field is 2004; the key is kept unchanged so existing citations still resolve.}
@inproceedings{kuang2005profileb,
  title     = {Profile-based string kernels for remote homology detection and motif extraction},
  author    = {Rui Kuang and Eugene Ie and Ke Wang and Kai Wang and Mahira Siddiqi and Yoav Freund and Christina Leslie},
  url       = {http://compbio.cs.umn.edu/paper/profile-kernel.pdf},
  doi       = {10.1109/CSB.2004.1332428},
  isbn      = {0-7695-2194-0},
  year      = {2004},
  date      = {2004-08-19},
  booktitle = {Proceedings of the 2004 IEEE Computational Systems Bioinformatics Conference (CSB 2004)},
  publisher = {IEEE},
  abstract  = {We introduce novel profile-based string kernels for use with support vector machines (SVMs) for the problems of protein classification and remote homology detection. These kernels use probabilistic profiles, such as those produced by the PSI-BLAST algorithm, to define position-dependent mutation neighborhoods along protein sequences for inexact matching of k-length subsequences (“k-mers”) in the data. By use of an efficient data structure, the kernels are fast to compute once the profiles have been obtained. For example, the time needed to run PSI-BLAST in order to build the profiles is significantly longer than both the kernel computation time and the SVM training time. We present remote homology detection experiments based on the SCOP database where we show that profile-based string kernels used with SVM classifiers strongly outperform all recently presented supervised SVM methods. We also show how we can use the learned SVM classifier to extract “discriminative sequence motifs” - short regions of the original profile that contribute almost all the weight of the SVM classification score - and show that these discriminative motifs correspond to meaningful structural features in the protein data. The use of PSI-BLAST profiles can be seen as a semi-supervised learning technique, since PSI-BLAST leverages unlabeled data from a large sequence database to build more informative profiles. Recently presented “cluster kernels” give general semi-supervised methods for improving SVM protein classification performance. We show that our profile kernel results are comparable to cluster kernels while providing much better scalability to large datasets.},
  keywords  = {Protein Remote Homology Detection, String Kernels},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Leslie, Christina; Kuang, Rui; Eskin, Eleazar
Inexact matching string kernels for protein classification Book
MIT Press, Cambridge, MA, 2004, ISBN: 9780262256926.
BibTeX | Tags: Protein Remote Homology Detection, String Kernels
@comment{Leslie, Kuang and Eskin 2004: a chapter (pages 95--112) in the book Kernel Methods in Computational Biology, so it is typed as incollection with the book title in booktitle. The publisher city is split out into address.}
@incollection{leslie2004inexact,
  title     = {Inexact matching string kernels for protein classification},
  author    = {Christina Leslie and Rui Kuang and Eleazar Eskin},
  isbn      = {9780262256926},
  year      = {2004},
  date      = {2004-01-01},
  booktitle = {Kernel Methods in Computational Biology},
  volume    = {1},
  pages     = {95--112},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  keywords  = {Protein Remote Homology Detection, String Kernels},
  pubstate  = {published},
  tppubtype = {incollection}
}
2003
Leslie, Christina; Kuang, Rui
Fast kernels for inexact string matching Proceedings Article
In: 16th Annual Conference on Computational Learning Theory and 7th Kernel Workshop (COLT/Kernel), Springer, 2003, ISBN: 978-3-540-45167-9.
Abstract | Links | BibTeX | Tags: Protein Remote Homology Detection, String Kernels
@comment{Leslie and Kuang 2003: COLT/Kernel 2003 paper published by Springer. Volume 2777 is the Lecture Notes in Computer Science volume number, so the series is named explicitly. The mismatch-kernel constant in the abstract is written as LaTeX math.}
@inproceedings{leslie2003fast,
  title     = {Fast kernels for inexact string matching},
  author    = {Christina Leslie and Rui Kuang},
  url       = {http://link.springer.com/content/pdf/10.1007%2F978-3-540-45167-9_10.pdf},
  doi       = {10.1007/978-3-540-45167-9_10},
  isbn      = {978-3-540-45167-9},
  year      = {2003},
  date      = {2003-01-01},
  booktitle = {16th Annual Conference on Computational Learning Theory and 7th Kernel Workshop (COLT/Kernel)},
  series    = {Lecture Notes in Computer Science},
  volume    = {2777},
  publisher = {Springer},
  abstract  = {We introduce several new families of string kernels designed in particular for use with support vector machines (SVMs) for classification of protein sequence data. These kernels – restricted gappy kernels, substitution kernels, and wildcard kernels – are based on feature spaces indexed by k-length subsequences from the string alphabet Σ (or the alphabet augmented by a wildcard character), and hence they are related to the recently presented (k,m)-mismatch kernel and string kernels used in text classification. However, for all kernels we define here, the kernel value K(x,y) can be computed in O(cK(|x| + |y|)) time, where the constant cK depends on the parameters of the kernel but is independent of the size |Σ| of the alphabet. Thus the computation of these kernels is linear in the length of the sequences, like the mismatch kernel, but we improve upon the parameter-dependent constant $c_K = k^{m+1} |\Sigma|^{m}$ of the mismatch kernel. We compute the kernels efficiently using a recursive function based on a trie data structure and relate our new kernels to the recently described transducer formalism. Finally, we report protein classification experiments on a benchmark SCOP dataset, where we show that our new faster kernels achieve SVM classification performance comparable to the mismatch kernel and the Fisher kernel derived from profile hidden Markov models.},
  keywords  = {Protein Remote Homology Detection, String Kernels},
  pubstate  = {published},
  tppubtype = {inproceedings}
}