2017
Zhang, Wei; Chien, Jeremy; Yong, Jeongsik; Kuang, Rui
Network-based Machine Learning and Graph Theory Algorithms for Precision Oncology Journal Article
In: NPJ Precision Oncology, no. 25, 2017.
Abstract | Links | BibTeX | Tags: Cancer Genomics, Network-based Learning, Phenome-genome Association, Protein-Protein Interaction Network, Semi-supervised Learning
@article{networkreview2017,
  title = {Network-based Machine Learning and Graph Theory Algorithms for Precision Oncology},
  author = {Wei Zhang and Jeremy Chien and Jeongsik Yong and Rui Kuang},
  journal = {NPJ Precision Oncology},
  volume = {1},
  pages = {25},
  year = {2017},
  date = {2017-08-08},
  doi = {10.1038/s41698-017-0029-7},
  url = {https://www.nature.com/articles/s41698-017-0029-7},
  abstract = {Network-based analytics plays an increasingly important role in precision oncology. Growing evidence in recent studies suggests that cancer can be better understood through mutated or dysregulated pathways or networks rather than individual mutations and that the efficacy of repositioned drugs can be inferred from disease modules in molecular networks. This article reviews network-based machine learning and graph theory algorithms for integrative analysis of personal genomic data and biomedical knowledge bases to identify tumor-specific molecular mechanisms, candidate targets and repositioned drugs for personalized treatment. The review focuses on the algorithmic design and mathematical formulation of these methods to facilitate applications and implementations of network-based analysis in the practice of precision oncology. We review the methods applied in three scenarios to integrate genomic data and network models in different analysis pipelines, and we examine three categories of network-based approaches for repositioning drugs in drug-disease-gene networks. In addition, we perform a comprehensive subnetwork/pathway analysis of mutations in 31 cancer genome projects in the Cancer Genome Atlas (TCGA) and present a detailed case study on ovarian cancer. Finally, we discuss interesting observations, potential pitfalls and future directions in network-based precision oncology.},
  keywords = {Cancer Genomics, Network-based Learning, Phenome-genome Association, Protein-Protein Interaction Network, Semi-supervised Learning},
  pubstate = {published},
  tppubtype = {article}
}
2015
Zhang, Wei; Chang, Jae-Woong; Lin, Lilong; Minn, Kay; Wu, Baolin; Chien, Jeremy; Yong, Jeongsik; Zheng, Hui; Kuang, Rui
Network-based Isoform Quantification with RNA-Seq Data for Cancer Transcriptome Analysis Journal Article
In: PLoS Computational Biology, vol. 11, no. 12, pp. e1004465, 2015.
Abstract | Links | BibTeX | Tags: Cancer Genomics, Isoform Quantification, Network-based Learning
@article{Net-RSTQ,
  title = {Network-based Isoform Quantification with {RNA-Seq} Data for Cancer Transcriptome Analysis},
  author = {Wei Zhang and Jae-Woong Chang and Lilong Lin and Kay Minn and Baolin Wu and Jeremy Chien and Jeongsik Yong and Hui Zheng and Rui Kuang},
  journal = {PLoS Computational Biology},
  volume = {11},
  number = {12},
  pages = {e1004465},
  year = {2015},
  date = {2015-12-23},
  doi = {10.1371/journal.pcbi.1004465},
  url = {http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1004465},
  abstract = {New sequencing technologies for transcriptome-wide profiling of RNAs have greatly promoted the interest in isoform-based functional characterizations of a cellular system. Elucidation of gene expressions at the isoform resolution could lead to new molecular mechanisms such as gene-regulations and alternative splicings, and potentially better molecular signals for phenotype predictions. However, it could be overly optimistic to derive the proportion of the isoforms of a gene solely based on short read alignments. Inherently, systematical sampling biases from RNA library preparation and ambiguity of read origins in overlapping isoforms pose a problem in reliability. The work in this paper exams the possibility of using protein domain-domain interactions as prior knowledge in isoform transcript quantification. We first made the observation that protein domain-domain interactions positively correlate with isoform co-expressions in TCGA data and then designed a probabilistic EM approach to integrate domain-domain interactions with short read alignments for estimation of isoform proportions. Validated by qRT-PCR experiments on three cell lines, simulations and classifications of TCGA patient samples in several cancer types, Net-RSTQ is proven a useful tool for isoform-based analysis in functional genomes and systems biology.},
  keywords = {Cancer Genomics, Isoform Quantification, Network-based Learning},
  pubstate = {published},
  tppubtype = {article}
}
Chien, Jeremy; Sicotte, Hugues; Fan, Jian-Bing; Humphray, Sean; Cunningham, Julie M; Kalli, Kimberly R; Oberg, Ann L; Hart, Steven N; Li, Ying; Davila, Jaime I; others,
TP53 mutations, tetraploidy and homologous recombination repair defects in early stage high-grade serous ovarian cancer Journal Article
In: Nucleic acids research, pp. gkv111, 2015.
Abstract | Links | BibTeX | Tags: Cancer Genomics
@article{chien2015tp53,
  title = {{TP53} mutations, tetraploidy and homologous recombination repair defects in early stage high-grade serous ovarian cancer},
  author = {Jeremy Chien and Hugues Sicotte and Jian-Bing Fan and Sean Humphray and Julie M Cunningham and Kimberly R Kalli and Ann L Oberg and Steven N Hart and Ying Li and Jaime I Davila and others},
  journal = {Nucleic Acids Research},
  volume = {43},
  number = {14},
  pages = {6945--6958},
  publisher = {Oxford University Press},
  year = {2015},
  date = {2015-02-02},
  doi = {10.1093/nar/gkv111},
  url = {http://nar.oxfordjournals.org/content/43/14/6945},
  abstract = {To determine early somatic changes in high-grade serous ovarian cancer (HGSOC), we performed whole genome sequencing on a rare collection of 16 low stage HGSOCs. The majority showed extensive structural alterations (one had an ultramutated profile), exhibited high levels of p53 immunoreactivity, and harboured TP53 mutation, deletion or inactivation. BRCA1 and BRCA2 mutations were observed in two tumors, with nine showing evidence of a homologous recombination (HR) defect. Combined analysis with The Cancer Genome Atlas indicated that low and late stage HGSOCs have similar mutation and copy number profiles. We also found evidence that deleterious TP53 mutations are the earliest events, followed by deletions or loss of heterozygosity (LOH) of chromosomes carrying TP53, BRCA1 or BRCA2. Inactivation of HR appears to be an early event, as 62.5% of tumours showed a LOH pattern suggestive of HR defects. Three tumours with the highest ploidy had little genome-wide LOH, yet one of these had a homozygous somatic frame-shift BRCA2 mutation, suggesting that some carcinomas begin as tetraploid then descend into diploidy accompanied by genome-wide LOH. Lastly, we found evidence that structural variants (SV) cluster in HGSOC, but are absent in one ultramutated tumor, providing insights into the pathogenesis of low stage HGSOC.},
  keywords = {Cancer Genomics},
  pubstate = {published},
  tppubtype = {article}
}
Johnson, Nicholas; Zhang, Huanan; Fang, Gang; Kumar, Vipin; Kuang, Rui
SubPatCNV: approximate subspace pattern mining for mapping copy-number variations Journal Article
In: BMC bioinformatics, vol. 16, no. 1, pp. 1, 2015, ISSN: 1471-2105.
Abstract | Links | BibTeX | Tags: Cancer Genomics, DNA Copy Number Variation
@article{johnson2015subpatcnv,
  title = {{SubPatCNV}: approximate subspace pattern mining for mapping copy-number variations},
  author = {Nicholas Johnson and Huanan Zhang and Gang Fang and Vipin Kumar and Rui Kuang},
  journal = {BMC Bioinformatics},
  volume = {16},
  number = {1},
  pages = {1},
  publisher = {BioMed Central},
  year = {2015},
  date = {2015-01-16},
  issn = {1471-2105},
  doi = {10.1186/s12859-014-0426-7},
  url = {https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-014-0426-7},
  abstract = {Background
Many DNA copy-number variations (CNVs) are known to lead to phenotypic variations and pathogenesis. While CNVs are often only common in a small number of samples in the studied population or patient cohort, previous work has not focused on customized identification of CNV regions that only exhibit in subsets of samples with advanced data mining techniques to reliably answer questions such as “Which are all the chromosomal fragments showing nearly identical deletions or insertions in more than 30% of the individuals?”.
Results
We introduce a tool for mining CNV subspace patterns, namely SubPatCNV, which is capable of identifying all aberrant CNV regions specific to arbitrary sample subsets larger than a support threshold. By design, SubPatCNV is the implementation of a variation of approximate association pattern mining algorithm under a spatial constraint on the positional CNV probe features. In benchmark test, SubPatCNV was applied to identify population specific germline CNVs from four populations of HapMap samples. In experiments on the TCGA ovarian cancer dataset, SubPatCNV discovered many large aberrant CNV events in patient subgroups, and reported regions enriched with cancer relevant genes. In both HapMap data and TCGA data, it was observed that SubPatCNV employs approximate pattern mining to more effectively identify CNV subspace patterns that are consistent within a subgroup from high-density array data.
Conclusions
SubPatCNV available through http://sourceforge.net/projects/subpatcnv/ is a unique scalable open-source software tool that provides the flexibility of identifying CNV regions specific to sample subgroups of different sizes from high-density CNV array data.},
  keywords = {Cancer Genomics, DNA Copy Number Variation},
  pubstate = {published},
  tppubtype = {article}
}
Many DNA copy-number variations (CNVs) are known to lead to phenotypic variations and pathogenesis. While CNVs are often only common in a small number of samples in the studied population or patient cohort, previous work has not focused on customized identification of CNV regions that only exhibit in subsets of samples with advanced data mining techniques to reliably answer questions such as “Which are all the chromosomal fragments showing nearly identical deletions or insertions in more than 30% of the individuals?”.
Results
We introduce a tool for mining CNV subspace patterns, namely SubPatCNV, which is capable of identifying all aberrant CNV regions specific to arbitrary sample subsets larger than a support threshold. By design, SubPatCNV is the implementation of a variation of approximate association pattern mining algorithm under a spatial constraint on the positional CNV probe features. In benchmark test, SubPatCNV was applied to identify population specific germline CNVs from four populations of HapMap samples. In experiments on the TCGA ovarian cancer dataset, SubPatCNV discovered many large aberrant CNV events in patient subgroups, and reported regions enriched with cancer relevant genes. In both HapMap data and TCGA data, it was observed that SubPatCNV employs approximate pattern mining to more effectively identify CNV subspace patterns that are consistent within a subgroup from high-density array data.
Conclusions
SubPatCNV available through http://sourceforge.net/projects/subpatcnv/ is a unique scalable open-source software tool that provides the flexibility of identifying CNV regions specific to sample subgroups of different sizes from high-density CNV array data.
2013
Zhang, Huanan; Tian, Ze; Kuang, Rui
Transfer learning across cancers on DNA copy number variation analysis Proceedings Article
In: 2013 IEEE 13th International Conference on Data Mining, pp. 1283–1288, IEEE, 2013, ISBN: 978-0-7695-5108-1.
Abstract | Links | BibTeX | Tags: Cancer Genomics, DNA Copy Number Variation, Transfer Learning
@inproceedings{zhang2013transfer,
  title = {Transfer learning across cancers on {DNA} copy number variation analysis},
  author = {Huanan Zhang and Ze Tian and Rui Kuang},
  booktitle = {2013 IEEE 13th International Conference on Data Mining},
  pages = {1283--1288},
  publisher = {IEEE},
  year = {2013},
  date = {2013-12-07},
  isbn = {978-0-7695-5108-1},
  doi = {10.1109/ICDM.2013.58},
  url = {http://compbio.cs.umn.edu/wp-content/uploads/2017/10/TLFL-10Page.pdf},
  abstract = {DNA copy number variations (CNVs) are prevalent in all types of tumors. It is still a challenge to study how CNVs play a role in driving tumorigenic mechanisms that are either universal or specific in different cancer types. To address the problem, we introduce a transfer learning framework to discover common CNVs shared across different tumor types as well as CNVs specific to each tumor type from genome-wide CNV data measured by array CGH and SNP genotyping array. The proposed model, namely Transfer Learning with Fused LASSO (TLFL), detects latent CNV components from multiple CNV datasets of different tumor types to distinguish the CNVs that are common across the datasets and those that are specific in each dataset. Both the common and type-specific CNVs are detected as latent components in matrix factorization coupled with fused LASSO on adjacent CNV probe features. TLFL considers the common latent components underlying the multiple datasets to transfer knowledge across different tumor types. In simulations and experiments on real cancer CNV datasets, TLFL detected better latent components that can be used as features to improve classification of patient samples in each individual dataset compared with the model without the knowledge transfer. In cross-dataset analysis on bladder cancer and cross-domain analysis on breast cancer and ovarian cancer, TLFL also learned latent CNV components that are both predictive of tumor stages and correlate with known cancer genes.},
  keywords = {Cancer Genomics, DNA Copy Number Variation, Transfer Learning},
  pubstate = {published},
  tppubtype = {inproceedings}
}
DNA copy number variations (CNVs) are prevalent in all types of tumors. It is still a challenge to study how CNVs play a role in driving tumorigenic mechanisms that are either universal or specific in different cancer types. To address the problem, we introduce a transfer learning framework to discover common CNVs shared across different tumor types as well as CNVs specific to each tumor type from genome-wide CNV data measured by array CGH and SNP genotyping array. The proposed model, namely Transfer Learning with Fused LASSO (TLFL), detects latent CNV components from multiple CNV datasets of different tumor types to distinguish the CNVs that are common across the datasets and those that are specific in each dataset. Both the common and type-specific CNVs are detected as latent components in matrix factorization coupled with fused LASSO on adjacent CNV probe features. TLFL considers the common latent components underlying the multiple datasets to transfer knowledge across different tumor types. In simulations and experiments on real cancer CNV datasets, TLFL detected better latent components that can be used as features to improve classification of patient samples in each individual dataset compared with the model without the knowledge transfer. In cross-dataset analysis on bladder cancer and cross-domain analysis on breast cancer and ovarian cancer, TLFL also learned latent CNV components that are both predictive of tumor stages and correlate with known cancer genes.
Chien, Jeremy; Kuang, Rui; Landen, Charles; Shridhar, Viji
Platinum-sensitive recurrence in ovarian cancer: the role of tumor microenvironment Journal Article
In: Frontiers in oncology, vol. 3, pp. 251, 2013.
Abstract | Links | BibTeX | Tags: Cancer Genomics
@article{chien2013platinumb,
  title = {Platinum-sensitive recurrence in ovarian cancer: the role of tumor microenvironment},
  author = {Jeremy Chien and Rui Kuang and Charles Landen and Viji Shridhar},
  journal = {Frontiers in Oncology},
  volume = {3},
  pages = {251},
  publisher = {Frontiers},
  year = {2013},
  date = {2013-09-23},
  doi = {10.3389/fonc.2013.00251},
  url = {http://journal.frontiersin.org/article/10.3389/fonc.2013.00251/full},
  abstract = {Despite several advances in the understanding of ovarian cancer pathobiology, in terms of driver genetic alterations in high-grade serous cancer, histologic heterogeneity of epithelial ovarian cancer, cell-of-origin for ovarian cancer, the survival rate from ovarian cancer is disappointingly low when compared to that of breast or prostate cancer. One of the factors contributing to the poor survival rate from ovarian cancer is the development of chemotherapy resistance following several rounds of chemotherapy. Although unicellular drug resistance mechanisms contribute to chemotherapy resistance, tumor microenvironment and the extracellular matrix (ECM), in particular, is emerging as a significant determinant of a tumor’s response to chemotherapy. In this review, we discuss the potential role of the tumor microenvironment in ovarian cancer recurrence and resistance to chemotherapy. Finally, we propose an alternative view of platinum-sensitive recurrence to describe a potential role of the ECM in the process.},
  keywords = {Cancer Genomics},
  pubstate = {published},
  tppubtype = {article}
}
Hwang, TaeHyun; Atluri, Gowtham; Kuang, Rui; Kumar, Vipin; Starr, Timothy; Silverstein, Kevin AT; Haverty, Peter M; Zhang, Zemin; Liu, Jinfeng
Large-scale integrative network-based analysis identifies common pathways disrupted by copy number alterations across cancers Journal Article
In: BMC genomics, vol. 14, no. 1, pp. 440, 2013.
Abstract | Links | BibTeX | Tags: Cancer Genomics, DNA Copy Number Variation, Network-based Learning
@article{hwang2013large,
  title = {Large-scale integrative network-based analysis identifies common pathways disrupted by copy number alterations across cancers},
  author = {TaeHyun Hwang and Gowtham Atluri and Rui Kuang and Vipin Kumar and Timothy Starr and Kevin AT Silverstein and Peter M Haverty and Zemin Zhang and Jinfeng Liu},
  journal = {BMC Genomics},
  volume = {14},
  number = {1},
  pages = {440},
  publisher = {BioMed Central Ltd},
  year = {2013},
  date = {2013-07-03},
  doi = {10.1186/1471-2164-14-440},
  url = {http://bmcgenomics.biomedcentral.com/articles/10.1186/1471-2164-14-440},
  abstract = {Many large-scale studies analyzed high-throughput genomic data to identify altered pathways essential to the development and progression of specific types of cancer. However, no previous study has been extended to provide a comprehensive analysis of pathways disrupted by copy number alterations across different human cancers. Towards this goal, we propose a network-based method to integrate copy number alteration data with human protein-protein interaction networks and pathway databases to identify pathways that are commonly disrupted in many different types of cancer.},
  keywords = {Cancer Genomics, DNA Copy Number Variation, Network-based Learning},
  pubstate = {published},
  tppubtype = {article}
}
Zhang, Wei; Ota, Takayo; Shridhar, Viji; Chien, Jeremy; Wu, Baolin; Kuang, Rui
Network-based survival analysis reveals subnetwork signatures for predicting outcomes of ovarian cancer treatment Journal Article
In: PLoS Comput Biol, vol. 9, no. 3, pp. e1002975, 2013.
Abstract | Links | BibTeX | Tags: Cancer Genomics, Network-based Learning, Survival Analysis, Transcriptome
@article{zhang2013network,
  title = {Network-based survival analysis reveals subnetwork signatures for predicting outcomes of ovarian cancer treatment},
  author = {Wei Zhang and Takayo Ota and Viji Shridhar and Jeremy Chien and Baolin Wu and Rui Kuang},
  journal = {PLoS Computational Biology},
  volume = {9},
  number = {3},
  pages = {e1002975},
  publisher = {Public Library of Science},
  year = {2013},
  date = {2013-03-21},
  doi = {10.1371/journal.pcbi.1002975},
  url = {http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1002975},
  abstract = {Cox regression is commonly used to predict the outcome by the time to an event of interest and in addition, identify relevant features for survival analysis in cancer genomics. Due to the high-dimensionality of high-throughput genomic data, existing Cox models trained on any particular dataset usually generalize poorly to other independent datasets. In this paper, we propose a network-based Cox regression model called Net-Cox and applied Net-Cox for a large-scale survival analysis across multiple ovarian cancer datasets. Net-Cox integrates gene network information into the Cox's proportional hazard model to explore the co-expression or functional relation among high-dimensional gene expression features in the gene network. Net-Cox was applied to analyze three independent gene expression datasets including the TCGA ovarian cancer dataset and two other public ovarian cancer datasets. Net-Cox with the network information from gene co-expression or functional relations identified highly consistent signature genes across the three datasets, and because of the better generalization across the datasets, Net-Cox also consistently improved the accuracy of survival prediction over the Cox models regularized by L1-norm or L2-norm. This study focused on analyzing the death and recurrence outcomes in the treatment of ovarian carcinoma to identify signature genes that can more reliably predict the events. The signature genes comprise dense protein-protein interaction subnetworks, enriched by extracellular matrix receptors and modulators or by nuclear signaling components downstream of extracellular signal-regulated kinases.
In the laboratory validation of the signature genes, a tumor array experiment by protein staining on an independent patient cohort from Mayo Clinic showed that the protein expression of the signature gene FBN1 is a biomarker significantly associated with the early recurrence after 12 months of the treatment in the ovarian cancer patients who are initially sensitive to chemotherapy. Net-Cox toolbox is available at http://localhost/~raphaelpetegrosso/wpcb/Net-Cox/.},
  keywords = {Cancer Genomics, Network-based Learning, Survival Analysis, Transcriptome},
  pubstate = {published},
  tppubtype = {article}
}
2010
Zhang, Wei; Hwang, Baryun; Wu, Baolin; Kuang, Rui
Network propagation models for gene selection Proceedings Article
In: 2010 IEEE International Workshop on Genomic Signal Processing and Statistics (GENSIPS), IEEE, 2010, ISBN: 978-1-61284-791-7.
Abstract | Links | BibTeX | Tags: Cancer Genomics, Gene Expression, Semi-supervised Learning
@inproceedings{zhang2010network,
  author    = {Zhang, Wei and Hwang, Baryun and Wu, Baolin and Kuang, Rui},
  title     = {Network propagation models for gene selection},
  booktitle = {2010 IEEE International Workshop on Genomic Signal Processing and Statistics (GENSIPS)},
  publisher = {IEEE},
  year      = {2010},
  date      = {2010-10-12},
  isbn      = {978-1-61284-791-7},
  doi       = {10.1109/GENSIPS.2010.5719689},
  url       = {http://compbio.cs.umn.edu/wp-content/uploads/2017/10/NP.pdf},
  abstract  = {In this paper, we explore several network propagation methods for gene selection from microarray gene expression datasets. The network propagation methods capture gene co-expression and differential expression with unified machine learning frameworks. Large scale experiments on five breast cancer datasets validated that the network propagation methods are capable of selecting genes that are more biologically interpretable and more consistent across multiple datasets, compared with the existing approaches.},
  keywords  = {Cancer Genomics, Gene Expression, Semi-supervised Learning},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
2009
Gupta, Rohit; Agrawal, Smita; Rao, Navneet; Tian, Ze; Kuang, Rui; Kumar, Vipin
Integrative Biomarker Discovery for Breast Cancer Metastasis from Gene Expression and Protein Interaction Data Using Error-tolerant Pattern Mining Proceedings Article
In: Citeseer, 2009.
Abstract | Links | BibTeX | Tags: Cancer Genomics, Gene Expression
@inproceedings{mining2009integrative,
  title = {Integrative Biomarker Discovery for Breast Cancer Metastasis from Gene Expression and Protein Interaction Data Using Error-tolerant Pattern Mining},
  author = {Rohit Gupta and Smita Agrawal and Navneet Rao and Ze Tian and Rui Kuang and Vipin Kumar},
  booktitle = {International Conference on Bioinformatics and Computational Biology (BICoB)},
  publisher = {Citeseer},
  year = {2009},
  date = {2009-11-29},
  url = {http://compbio.cs.umn.edu/wp-content/uploads/2017/10/BICOB2010.pdf},
  abstract = {Biomarker discovery for complex diseases is a challenging problem. Most of the existing approaches identify individual genes as disease markers, thereby missing the interactions among genes. Moreover, often only single biological data source is used to discover biomarkers. These factors account for the discovery of inconsistent biomarkers. In this paper, we propose a novel error-tolerant pattern mining approach for integrated analysis of gene expression and protein interaction data. This integrated approach incorporates constraints from protein interaction network and efficiently discovers patterns (groups of genes) in a bottom-up fashion from the gene-expression data. We call these patterns active sub-network biomarkers. To illustrate the efficacy of our proposed approach, we used four breast cancer gene expression data sets and a human protein interaction network and showed that active sub-network biomarkers are more biologically plausible and genes discovered are more reproducible across studies. Finally, through pathway analysis, we also showed a substantial enrichment for known cancer genes and hence were able to generate relevant hypotheses for understanding the molecular mechanisms of breast cancer metastasis.},
  keywords = {Cancer Genomics, Gene Expression},
  pubstate = {published},
  tppubtype = {inproceedings}
}
problem. Most of the existing approaches identify
individual genes as disease markers, thereby missing the
interactions among genes. Moreover, often only single biological
data source is used to discover biomarkers. These
factors account for the discovery of inconsistent biomarkers.
In this paper, we propose a novel error-tolerant pattern
mining approach for integrated analysis of gene expression
and protein interaction data. This integrated approach incorporates
constraints from protein interaction network and
efficiently discovers patterns (groups of genes) in a bottom-up
fashion from the gene-expression data. We call these
patterns active sub-network biomarkers. To illustrate the
efficacy of our proposed approach, we used four breast cancer
gene expression data sets and a human protein interaction
network and showed that active sub-network biomarkers
are more biologically plausible and genes discovered
are more reproducible across studies. Finally, through pathway
analysis, we also showed a substantial enrichment for
known cancer genes and hence were able to generate relevant
hypotheses for understanding the molecular mechanisms
of breast cancer metastasis.
2008
Hwang, TaeHyun; Tian, Ze; Kuang, Rui; Kocher, Jean-Pierre
Learning on weighted hypergraphs to integrate protein interactions and gene expressions for cancer outcome prediction Proceedings Article
In: 2008 Eighth IEEE International Conference on Data Mining, pp. 293–302, IEEE 2008, ISBN: 978-0-7695-3502-9.
Abstract | Links | BibTeX | Tags: Cancer Genomics, Gene Expression, Protein-Protein Interaction Network, Semi-supervised Learning
@inproceedings{hwang2008learning,
  title = {Learning on weighted hypergraphs to integrate protein interactions and gene expressions for cancer outcome prediction},
  author = {TaeHyun Hwang and Ze Tian and Rui Kuang and Jean-Pierre Kocher},
  booktitle = {2008 Eighth IEEE International Conference on Data Mining},
  pages = {293--302},
  publisher = {IEEE},
  year = {2008},
  date = {2008-12-15},
  isbn = {978-0-7695-3502-9},
  doi = {10.1109/ICDM.2008.37},
  url = {http://compbio.cs.umn.edu/wp-content/uploads/2017/10/HyperGene.pdf},
  abstract = {Building reliable predictive models from multiple complementary genomic data for cancer study is a crucial step towards successful cancer treatment and a full understanding of the underlying biological principles. To tackle this challenging data integration problem, we propose a hypergraph-based learning algorithm called HyperGene to integrate microarray gene expressions and protein-protein interactions for cancer outcome prediction and biomarker identification. HyperGene is a robust two-step iterative method that alternatively finds the optimal outcome prediction and the optimal weighting of the marker genes guided by a protein-protein interaction network. Under the hypothesis that cancer-related genes tend to interact with each other, the HyperGene algorithm uses a protein-protein interaction network as prior knowledge by imposing a consistent weighting of interacting genes. Our experimental results on two large-scale breast cancer gene expression datasets show that HyperGene utilizing a curated protein-protein interaction network achieves significantly improved cancer outcome prediction. Moreover, HyperGene can also retrieve many known cancer genes as highly weighted marker genes.},
  keywords = {Cancer Genomics, Gene Expression, Protein-Protein Interaction Network, Semi-supervised Learning},
  pubstate = {published},
  tppubtype = {inproceedings}
}
Building reliable predictive models from multiple complementary genomic data for cancer study is a crucial step towards successful cancer treatment and a full understanding of the underlying biological principles. To tackle this challenging data integration problem, we propose a hypergraph-based learning algorithm called HyperGene to integrate microarray gene expressions and protein-protein interactions for cancer outcome prediction and biomarker identification. HyperGene is a robust two-step iterative method that alternatively finds the optimal outcome prediction and the optimal weighting of the marker genes guided by a protein-protein interaction network. Under the hypothesis that cancer-related genes tend to interact with each other, the HyperGene algorithm uses a protein-protein interaction network as prior knowledge by imposing a consistent weighting of interacting genes. Our experimental results on two large-scale breast cancer gene expression datasets show that HyperGene utilizing a curated protein-protein interaction network achieves significantly improved cancer outcome prediction. Moreover, HyperGene can also retrieve many known cancer genes as highly weighted marker genes.
Hwang, TaeHyun; Kuang, Rui
A Comparative Study of Breast Cancer Microarray Gene Expression Profiles using Label Propagation Proceedings Article
In: Proceedings of the Workshop on Data Mining for Biomedical Informatics, held in conjunction with SIAM International Conference on Data Mining (SDM), 2008.
Abstract | Links | BibTeX | Tags: Cancer Genomics, Semi-supervised Learning
@inproceedings{hwang2008comparative,
  title = {A Comparative Study of Breast Cancer Microarray Gene Expression Profiles using Label Propagation},
  author = {TaeHyun Hwang and Rui Kuang},
  booktitle = {Proceedings of the Workshop on Data Mining for Biomedical Informatics, held in conjunction with SIAM International Conference on Data Mining (SDM)},
  year = {2008},
  date = {2008-04-24},
  url = {http://compbio.cs.umn.edu/wp-content/uploads/2017/10/SDM2008.pdf},
  abstract = {A challenge in using microarray gene expression profiles to study breast cancer is to analyze the inconsistent discoveries made from independent microarray datasets. The inconsistency is often related to the tuning of those sophisticated strategies needed for taking into account the dependence among the genes in the analysis as well as the difference between the platforms and the protocols used for generating the datasets. In this paper, we use a simple graph labeling algorithm which can capture the dependency among the genes to study breast cancer microarray data. We perform a comparative study of breast cancer metastasis on two datasets using the graph labeling algorithm and the standard statistics of correlation coefficients. We show that our algorithm predicts more consistent marker genes and pathways enriched by the marker genes on the two datasets than the correlation-coefficient statistics.},
  keywords = {Cancer Genomics, Semi-supervised Learning},
  pubstate = {published},
  tppubtype = {inproceedings}
}
Hwang, TaeHyun; Sicotte, Hugues; Tian, Ze; Wu, Baolin; Kocher, Jean-Pierre; Wigle, Dennis A; Kumar, Vipin; Kuang, Rui
Robust and efficient identification of biomarkers by classifying features on graphs Journal Article
In: Bioinformatics, vol. 24, no. 18, pp. 2023–2029, 2008, ISSN: 1460-2059.
Abstract | Links | BibTeX | Tags: Cancer Genomics, Gene Expression, Semi-supervised Learning
@article{hwang2008robustb,
  title = {Robust and efficient identification of biomarkers by classifying features on graphs},
  author = {Hwang, TaeHyun and Sicotte, Hugues and Tian, Ze and Wu, Baolin and Kocher, Jean-Pierre and Wigle, Dennis A and Kumar, Vipin and Kuang, Rui},
  journal = {Bioinformatics},
  volume = {24},
  number = {18},
  pages = {2023--2029},
  publisher = {Oxford University Press},
  year = {2008},
  date = {2008-01-01},
  issn = {1460-2059},
  doi = {10.1093/bioinformatics/btn383},
  url = {http://bioinformatics.oxfordjournals.org/content/24/18/2023.short},
  abstract = {Motivation: A central problem in biomarker discovery from large-scale gene expression or single nucleotide polymorphism (SNP) data is the computational challenge of taking into account the dependence among all the features. Methods that ignore the dependence usually identify non-reproducible biomarkers across independent datasets. We introduce a new graph-based semi-supervised feature classification algorithm to identify discriminative disease markers by learning on bipartite graphs. Our algorithm directly classifies the feature nodes in a bipartite graph as positive, negative or neutral with network propagation to capture the dependence among both samples and features (clinical and genetic variables) by exploring bi-cluster structures in a graph. Two features of our algorithm are: (1) our algorithm can find a global optimal labeling to capture the dependence among all the features and thus, generates highly reproducible results across independent microarray or other high-throughput datasets, (2) our algorithm is capable of handling hundreds of thousands of features and thus, is particularly useful for biomarker identification from high-throughput gene expression and SNP data. In addition, although designed for classifying features, our algorithm can also simultaneously classify test samples for disease prognosis/diagnosis.
Results: We applied the network propagation algorithm to study three large-scale breast cancer datasets. Our algorithm achieved competitive classification performance compared with SVMs and other baseline methods, and identified several markers with clinical or biological relevance with the disease. More importantly, our algorithm also identified highly reproducible marker genes and enriched functions from the independent datasets.
Availability: Supplementary results and source code are available at http://localhost/~raphaelpetegrosso/wpcb/Feature_Class.},
  keywords = {Cancer Genomics, Gene Expression, Semi-supervised Learning},
  pubstate = {published},
  tppubtype = {article}
}
Results: We applied the network propagation algorithm to study three large-scale breast cancer datasets. Our algorithm achieved competitive classification performance compared with SVMs and other baseline methods, and identified several markers with clinical or biological relevance with the disease. More importantly, our algorithm also identified highly reproducible marker genes and enriched functions from the independent datasets.
Availability: Supplementary results and source code are available at http://localhost/~raphaelpetegrosso/wpcb/Feature_Class.