2016
Liang, Lining; Sun, Hao; Zhang, Wei; Zhang, Mengdan; Yang, Xiao; Kuang, Rui; Zheng, Hui
Meta-Analysis of EMT Datasets Reveals Different Types of EMT. Journal Article
In: PLOS ONE, vol. 11, no. 6, pp. e0156839, 2016.
Abstract | Links | BibTeX | Tags: Gene Expression, Transcriptome
@article{liang2015meta,
  title      = {Meta-Analysis of {EMT} Datasets Reveals Different Types of {EMT}},
  author     = {Lining Liang and Hao Sun and Wei Zhang and Mengdan Zhang and Xiao Yang and Rui Kuang and Hui Zheng},
  journal    = {PLOS ONE},
  volume     = {11},
  number     = {6},
  pages      = {e0156839},
  year       = {2016},
  date       = {2016-06-03},
  doi        = {10.1371/journal.pone.0156839},
  url        = {http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0156839},
  abstract   = {As a critical process during embryonic development, cancer progression and cell fate conversions, epithelial-mesenchymal transition (EMT) has been extensively studied over the last several decades. To further understand the nature of EMT, we performed meta-analysis of multiple microarray datasets to identify the related generic signature. In this study, 24 human and 17 mouse microarray datasets were integrated to identify conserved gene expression changes in different types of EMT. Our integrative analysis revealed that there is low agreement among the list of the identified signature genes and three other lists in previous studies. Since removing the datasets with weakly-induced EMT from the analysis did not significantly improve the overlapping in the signature-gene lists, we hypothesized the existence of different types of EMT. This hypothesis was further supported by the grouping of 74 human EMT-induction samples into five distinct clusters, and the identification of distinct pathways in these different clusters of EMT samples. The five clusters of EMT-induction samples also improves the understanding of the characteristics of different EMT types. Therefore, we concluded the existence of different types of EMT was the possible reason for its complex role in multiple biological processes.},
  keywords   = {Gene Expression, Transcriptome},
  pubstate   = {published},
  tppubtype  = {article}
}
2012
Zhang, Wei; Johnson, Nicholas; Wu, Baolin; Kuang, Rui
Signed network propagation for detecting differential gene expressions and DNA copy number variations Proceedings Article
In: Proceedings of the ACM Conference on Bioinformatics, Computational Biology and Biomedicine, pp. 337–344, ACM 2012.
Abstract | Links | BibTeX | Tags: DNA Copy Number Variation, Gene Expression, Semi-supervised Learning
@inproceedings{zhang2012signedb,
  title        = {Signed network propagation for detecting differential gene expressions and {DNA} copy number variations},
  author       = {Wei Zhang and Nicholas Johnson and Baolin Wu and Rui Kuang},
  booktitle    = {Proceedings of the ACM Conference on Bioinformatics, Computational Biology and Biomedicine},
  pages        = {337--344},
  organization = {ACM},
  year         = {2012},
  date         = {2012-10-07},
  url          = {http://compbio.cs.umn.edu/wp-content/uploads/2017/10/SignedNP-1.pdf},
  abstract     = {Network propagation algorithms have proved useful for the analysis of high-dimensional genomic data. One limitation is that the current formulation only allows network propagation on positively weighted graphs. In this paper, we explore two signed network propagation algorithms and general optimization frameworks for detecting differential gene expressions and DNA copy number variations (CNV). The proposed algorithms consider both positive and negative relations in graphs to model gene up/down-regulation or amplification/deletion CNV events. The first algorithm (Signed-NP) integrates gene co-expressions and differential expressions for consistent and robust gene selection from microarray datasets by propagation on gene correlation graphs. The second algorithm (Signed-NPBi) identifies gene or CNV markers by propagation on sample-feature bipartite graphs to capture bi-clusters between samples and genomic features. Large scale experiments on several microarray gene expression datasets and CNV datasets validate that Signed-NP and Signed-NPBi perform better classification of gene expression and CNV data than standard network propagation. The experiments also demonstrate that Signed-NP is capable of selecting genes that are more biologically interpretable and consistent across multiple datasets, and Signed-NPBi can detect hidden CNV patterns in bi-clusters by smoothing on correlations between adjacent probes.},
  keywords     = {DNA Copy Number Variation, Gene Expression, Semi-supervised Learning},
  pubstate     = {published},
  tppubtype    = {inproceedings}
}
2010
Zhang, Wei; Hwang, Baryun; Wu, Baolin; Kuang, Rui
Network propagation models for gene selection Proceedings Article
In: 2010 IEEE International Workshop on Genomic Signal Processing and Statistics (GENSIPS), IEEE, 2010, ISBN: 978-1-61284-791-7.
Abstract | Links | BibTeX | Tags: Cancer Genomics, Gene Expression, Semi-supervised Learning
@inproceedings{zhang2010network,
  author    = {Wei Zhang and Baryun Hwang and Baolin Wu and Rui Kuang},
  title     = {Network propagation models for gene selection},
  booktitle = {2010 IEEE International Workshop on Genomic Signal Processing and Statistics (GENSIPS)},
  publisher = {IEEE},
  year      = {2010},
  date      = {2010-10-12},
  isbn      = {978-1-61284-791-7},
  doi       = {10.1109/GENSIPS.2010.5719689},
  url       = {http://compbio.cs.umn.edu/wp-content/uploads/2017/10/NP.pdf},
  keywords  = {Cancer Genomics, Gene Expression, Semi-supervised Learning},
  abstract  = {In this paper, we explore several network propagation methods for gene selection from microarray gene expression datasets. The network propagation methods capture gene co-expression and differential expression with unified machine learning frameworks. Large scale experiments on five breast cancer datasets validated that the network propagation methods are capable of selecting genes that are more biologically interpretable and more consistent across multiple datasets, compared with the existing approaches.},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Fang, Gang; Kuang, Rui; Pandey, Gaurav; Steinbach, Michael; Myers, Chad L; Kumar, Vipin
Subspace differential coexpression analysis: problem definition and a general approach. Proceedings Article
In: Pacific Symposium on Biocomputing, pp. 145–156, 2010.
Abstract | Links | BibTeX | Tags: Gene Expression, Transcriptome
@inproceedings{fang2010subspace,
  title     = {Subspace differential coexpression analysis: problem definition and a general approach},
  author    = {Gang Fang and Rui Kuang and Gaurav Pandey and Michael Steinbach and Chad L Myers and Vipin Kumar},
  booktitle = {Pacific Symposium on Biocomputing},
  volume    = {15},
  pages     = {145--156},
  year      = {2010},
  date      = {2010-01-04},
  doi       = {10.1142/9789814295291_0017},
  url       = {http://compbio.cs.umn.edu/wp-content/uploads/2017/10/9789814295291_0017.pdf},
  abstract  = {In this paper, we study methods to identify differential coexpression patterns in case-control gene expression data. A differential coexpression pattern consists of a set of genes that have substantially different levels of coherence of their expression profiles across the two sample-classes, i.e., highly coherent in one class, but not in the other. Biologically, a differential coexpression patterns may indicate the disruption of a regulatory mechanism possibly caused by disregulation of pathways or mutations of transcription factors. A common feature of all the existing approaches for differential coexpression analysis is that the coexpression of a set of genes is measured on all the samples in each of the two classes, i.e., over the full-space of samples. Hence, these approaches may miss patterns that only cover a subset of samples in each class, i.e., subspace patterns, due to the heterogeneity of the subject population and disease causes. In this paper, we extend differential coexpression analysis by defining a subspace differential coexpression pattern, i.e., a set of genes that are coexpressed in a relatively large percent of samples in one class, but in a much smaller percent of samples in the other class. We propose a general approach based upon association analysis framework that allows exhaustive yet efficient discovery of subspace differential coexpression patterns. This approach can be used to adapt a family of biclustering algorithms to obtain their corresponding differential versions that can directly discover differential coexpression patterns. Using a recently developed biclustering algorithm as illustration, we perform experiments on cancer datasets which demonstrates the existence of subspace differential coexpression patterns. Permutation tests demonstrate the statistical significance for a large number of discovered subspace patterns, many of which can not be discovered if they are measured over all the samples in each of the classes. Interestingly, in our experiments, some discovered subspace patterns have significant overlap with known cancer pathways, and some are enriched with the target gene sets of cancer-related microRNA and transcription factors. The source codes and datasets used in this paper are available at http://vk.cs.umn.edu/SDC/.},
  keywords  = {Gene Expression, Transcriptome},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Read More: http://www.worldscientific.com/doi/abs/10.1142/9789814295291_0017
2009
Gupta, Rohit; Agrawal, Smita; Rao, Navneet; Tian, Ze; Kuang, Rui; Kumar, Vipin
Integrative Biomarker Discovery for Breast Cancer Metastasis from Gene Expression and Protein Interaction Data Using Error-tolerant Pattern Mining Proceedings Article
In: Citeseer, 2009.
Abstract | Links | BibTeX | Tags: Cancer Genomics, Gene Expression
@inproceedings{mining2009integrative,
  title     = {Integrative Biomarker Discovery for Breast Cancer Metastasis from Gene Expression and Protein Interaction Data Using Error-tolerant Pattern Mining},
  author    = {Rohit Gupta and Smita Agrawal and Navneet Rao and Ze Tian and Rui Kuang and Vipin Kumar},
  publisher = {Citeseer},
  year      = {2009},
  date      = {2009-11-29},
  url       = {http://compbio.cs.umn.edu/wp-content/uploads/2017/10/BICOB2010.pdf},
  abstract  = {Biomarker discovery for complex diseases is a challenging problem. Most of the existing approaches identify individual genes as disease markers, thereby missing the interactions among genes. Moreover, often only single biological data source is used to discover biomarkers. These factors account for the discovery of inconsistent biomarkers. In this paper, we propose a novel error-tolerant pattern mining approach for integrated analysis of gene expression and protein interaction data. This integrated approach incorporates constraints from protein interaction network and efficiently discovers patterns (groups of genes) in a bottom-up fashion from the gene-expression data. We call these patterns active sub-network biomarkers. To illustrate the efficacy of our proposed approach, we used four breast cancer gene expression data sets and a human protein interaction network and showed that active sub-network biomarkers are more biologically plausible and genes discovered are more reproducible across studies. Finally, through pathway analysis, we also showed a substantial enrichment for known cancer genes and hence were able to generate relevant hypotheses for understanding the molecular mechanisms of breast cancer metastasis.},
  keywords  = {Cancer Genomics, Gene Expression},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
problem. Most of the existing approaches identify
individual genes as disease markers, thereby missing the
interactions among genes. Moreover, often only single biological
data source is used to discover biomarkers. These
factors account for the discovery of inconsistent biomarkers.
In this paper, we propose a novel error-tolerant pattern
mining approach for integrated analysis of gene expression
and protein interaction data. This integrated approach incorporates
constraints from protein interaction network and
efficiently discovers patterns (groups of genes) in a bottom-up
fashion from the gene-expression data. We call these
patterns active sub-network biomarkers. To illustrate the
efficacy of our proposed approach, we used four breast cancer
gene expression data sets and a human protein interaction
network and showed that active sub-network biomarkers
are more biologically plausible and genes discovered
are more reproducible across studies. Finally, through pathway
analysis, we also showed a substantial enrichment for
known cancer genes and hence were able to generate relevant
hypotheses for understanding the molecular mechanisms
of breast cancer metastasis.
Tian, Ze; Hwang, TaeHyun; Kuang, Rui
A hypergraph-based learning algorithm for classifying gene expression and arrayCGH data with prior knowledge Journal Article
In: Bioinformatics, vol. 25, no. 21, pp. 2831–2838, 2009, ISSN: 1460-2059.
Abstract | Links | BibTeX | Tags: DNA Copy Number Variation, Gene Expression, Semi-supervised Learning
@article{tian2009hypergraph,
  title     = {A hypergraph-based learning algorithm for classifying gene expression and {arrayCGH} data with prior knowledge},
  author    = {Ze Tian and TaeHyun Hwang and Rui Kuang},
  journal   = {Bioinformatics},
  volume    = {25},
  number    = {21},
  pages     = {2831--2838},
  publisher = {Oxford Univ Press},
  year      = {2009},
  date      = {2009-07-27},
  issn      = {1460-2059},
  doi       = {10.1093/bioinformatics/btp467},
  url       = {http://bioinformatics.oxfordjournals.org/content/25/21/2831.short},
  abstract  = {Motivation: Incorporating biological prior knowledge into predictive models is a challenging data integration problem in analyzing high-dimensional genomic data. We introduce a hypergraph-based semi-supervised learning algorithm called HyperPrior to classify gene expression and array-based comparative genomic hybridization (arrayCGH) data using biological knowledge as constraints on graph-based learning. HyperPrior is a robust two-step iterative method that alternatively finds the optimal labeling of the samples and the optimal weighting of the features, guided by constraints encoding prior knowledge. The prior knowledge for analyzing gene expression data is that cancer-related genes tend to interact with each other in a protein–protein interaction network. Similarly, the prior knowledge for analyzing arrayCGH data is that probes that are spatially nearby in their layout along the chromosomes tend to be involved in the same amplification or deletion event. Based on the prior knowledge, HyperPrior imposes a consistent weighting of the correlated genomic features in graph-based learning.
Results: We applied HyperPrior to test two arrayCGH datasets and two gene expression datasets for both cancer classification and biomarker identification. On all the datasets, HyperPrior achieved competitive classification performance, compared with SVMs and the other baselines utilizing the same prior knowledge. HyperPrior also identified several discriminative regions on chromosomes and discriminative subnetworks in the PPI, both of which contain cancer-related genomic elements. Our results suggest that HyperPrior is promising in utilizing biological prior knowledge to achieve better classification performance and more biologically interpretable findings in gene expression and arrayCGH data.},
  keywords  = {DNA Copy Number Variation, Gene Expression, Semi-supervised Learning},
  pubstate  = {published},
  tppubtype = {article}
}
Results: We applied HyperPrior to test two arrayCGH datasets and two gene expression datasets for both cancer classification and biomarker identification. On all the datasets, HyperPrior achieved competitive classification performance, compared with SVMs and the other baselines utilizing the same prior knowledge. HyperPrior also identified several discriminative regions on chromosomes and discriminative subnetworks in the PPI, both of which contain cancer-related genomic elements. Our results suggest that HyperPrior is promising in utilizing biological prior knowledge to achieve better classification performance and more biologically interpretable findings in gene expression and arrayCGH data.
2008
Hwang, TaeHyun; Tian, Ze; Kuang, Rui; Kocher, Jean-Pierre
Learning on weighted hypergraphs to integrate protein interactions and gene expressions for cancer outcome prediction Proceedings Article
In: 2008 Eighth IEEE International Conference on Data Mining, pp. 293–302, IEEE 2008, ISBN: 978-0-7695-3502-9.
Abstract | Links | BibTeX | Tags: Cancer Genomics, Gene Expression, Protein-Protein Interaction Network, Semi-supervised Learning
@inproceedings{hwang2008learning,
  title        = {Learning on weighted hypergraphs to integrate protein interactions and gene expressions for cancer outcome prediction},
  author       = {TaeHyun Hwang and Ze Tian and Rui Kuang and Jean-Pierre Kocher},
  booktitle    = {2008 Eighth IEEE International Conference on Data Mining},
  pages        = {293--302},
  organization = {IEEE},
  year         = {2008},
  date         = {2008-12-15},
  isbn         = {978-0-7695-3502-9},
  doi          = {10.1109/ICDM.2008.37},
  url          = {http://compbio.cs.umn.edu/wp-content/uploads/2017/10/HyperGene.pdf},
  abstract     = {Building reliable predictive models from multiple complementary genomic data for cancer study is a crucial step towards successful cancer treatment and a full understanding of the underlying biological principles. To tackle this challenging data integration problem, we propose a hypergraph-based learning algorithm called HyperGene to integrate microarray gene expressions and protein-protein interactions for cancer outcome prediction and biomarker identification. HyperGene is a robust two-step iterative method that alternatively finds the optimal outcome prediction and the optimal weighting of the marker genes guided by a protein-protein interaction network. Under the hypothesis that cancer-related genes tend to interact with each other, the HyperGene algorithm uses a protein-protein interaction network as prior knowledge by imposing a consistent weighting of interacting genes. Our experimental results on two large-scale breast cancer gene expression datasets show that HyperGene utilizing a curated protein-protein interaction network achieves significantly improved cancer outcome prediction. Moreover, HyperGene can also retrieve many known cancer genes as highly weighted marker genes.},
  keywords     = {Cancer Genomics, Gene Expression, Protein-Protein Interaction Network, Semi-supervised Learning},
  pubstate     = {published},
  tppubtype    = {inproceedings}
}
Building reliable predictive models from multiple complementary genomic data for cancer study is a crucial step towards successful cancer treatment and a full understanding of the underlying biological principles. To tackle this challenging data integration problem, we propose a hypergraph-based learning algorithm called HyperGene to integrate microarray gene expressions and protein-protein interactions for cancer outcome prediction and biomarker identification. HyperGene is a robust two-step iterative method that alternatively finds the optimal outcome prediction and the optimal weighting of the marker genes guided by a protein-protein interaction network. Under the hypothesis that cancer-related genes tend to interact with each other, the HyperGene algorithm uses a protein-protein interaction network as prior knowledge by imposing a consistent weighting of interacting genes. Our experimental results on two large-scale breast cancer gene expression datasets show that HyperGene utilizing a curated protein-protein interaction network achieves significantly improved cancer outcome prediction. Moreover, HyperGene can also retrieve many known cancer genes as highly weighted marker genes.
Hwang, TaeHyun; Sicotte, Hugues; Tian, Ze; Wu, Baolin; Kocher, Jean-Pierre; Wigle, Dennis A; Kumar, Vipin; Kuang, Rui
Robust and efficient identification of biomarkers by classifying features on graphs Journal Article
In: Bioinformatics, vol. 24, no. 18, pp. 2023–2029, 2008, ISSN: 1460-2059.
Abstract | Links | BibTeX | Tags: Cancer Genomics, Gene Expression, Semi-supervised Learning
@article{hwang2008robustb,
  title     = {Robust and efficient identification of biomarkers by classifying features on graphs},
  author    = {Hwang, TaeHyun and Sicotte, Hugues and Tian, Ze and Wu, Baolin and Kocher, Jean-Pierre and Wigle, Dennis A and Kumar, Vipin and Kuang, Rui},
  journal   = {Bioinformatics},
  volume    = {24},
  number    = {18},
  pages     = {2023--2029},
  publisher = {Oxford Univ Press},
  year      = {2008},
  date      = {2008-01-01},
  issn      = {1460-2059},
  doi       = {10.1093/bioinformatics/btn383},
  url       = {http://bioinformatics.oxfordjournals.org/content/24/18/2023.short},
  abstract  = {Motivation: A central problem in biomarker discovery from large-scale gene expression or single nucleotide polymorphism (SNP) data is the computational challenge of taking into account the dependence among all the features. Methods that ignore the dependence usually identify non-reproducible biomarkers across independent datasets. We introduce a new graph-based semi-supervised feature classification algorithm to identify discriminative disease markers by learning on bipartite graphs. Our algorithm directly classifies the feature nodes in a bipartite graph as positive, negative or neutral with network propagation to capture the dependence among both samples and features (clinical and genetic variables) by exploring bi-cluster structures in a graph. Two features of our algorithm are: (1) our algorithm can find a global optimal labeling to capture the dependence among all the features and thus, generates highly reproducible results across independent microarray or other high-throughput datasets, (2) our algorithm is capable of handling hundreds of thousands of features and thus, is particularly useful for biomarker identification from high-throughput gene expression and SNP data. In addition, although designed for classifying features, our algorithm can also simultaneously classify test samples for disease prognosis/diagnosis.
Results: We applied the network propagation algorithm to study three large-scale breast cancer datasets. Our algorithm achieved competitive classification performance compared with SVMs and other baseline methods, and identified several markers with clinical or biological relevance with the disease. More importantly, our algorithm also identified highly reproducible marker genes and enriched functions from the independent datasets.
Availability: Supplementary results and source code are available at http://compbio.cs.umn.edu/Feature_Class.},
  keywords  = {Cancer Genomics, Gene Expression, Semi-supervised Learning},
  pubstate  = {published},
  tppubtype = {article}
}
Results: We applied the network propagation algorithm to study three large-scale breast cancer datasets. Our algorithm achieved competitive classification performance compared with SVMs and other baseline methods, and identified several markers with clinical or biological relevance with the disease. More importantly, our algorithm also identified highly reproducible marker genes and enriched functions from the independent datasets.
Availability: Supplementary results and source code are available at http://compbio.cs.umn.edu/Feature_Class.