2020 |
Sun, Jiao; Chang, Jae-Woong; Zhang, Teng; Yong, Jeongsik; Kuang, Rui; Zhang, Wei Platform-integrated mRNA Isoform Quantification Journal Article In: Bioinformatics, 36 (8), pp. 2466–2473, 2020. Links | BibTeX | Tags: Isoform Quantification
@article{WeiZhang2020,
  author    = {Sun, Jiao and Chang, Jae-Woong and Zhang, Teng and Yong, Jeongsik and Kuang, Rui and Zhang, Wei},
  title     = {Platform-integrated mRNA Isoform Quantification},
  journal   = {Bioinformatics},
  volume    = {36},
  number    = {8},
  pages     = {2466--2473},
  year      = {2020},
  date      = {2020-04-15},
  url       = {https://academic.oup.com/bioinformatics/article-abstract/36/8/2466/5675495?redirectedFrom=fulltext},
  keywords  = {Isoform Quantification},
  pubstate  = {published},
  tppubtype = {article}
} |
Petegrosso, Raphael; Song, Tianci; Kuang, Rui Hierarchical Canonical Correlation Analysis Reveals Phenotype, Genotype, and Geoclimate Associations in Plants Journal Article In: Plant Phenomics, 2020 (1969142), 2020. Links | BibTeX | Tags: Geoclimate, Phenome-genome Association
@article{Petegrosso2020,
  author    = {Petegrosso, Raphael and Song, Tianci and Kuang, Rui},
  title     = {Hierarchical Canonical Correlation Analysis Reveals Phenotype, Genotype, and Geoclimate Associations in Plants},
  journal   = {Plant Phenomics},
  volume    = {2020},
  number    = {1969142},
  year      = {2020},
  date      = {2020-03-31},
  doi       = {10.34133/2020/1969142},
  url       = {https://spj.sciencemag.org/plantphenomics/2020/1969142/cta/},
  keywords  = {Geoclimate, Phenome-genome Association},
  pubstate  = {published},
  tppubtype = {article}
} |
Zhang, Wei; Petegrosso, Raphael; Chang, Jae-Woong; Sun, Jiao; Yong, Jeongsik; Chien, Jeremy; Kuang, Rui A large-scale comparative study of isoform expressions measured on four platforms Journal Article In: BMC Genomics, 21 (272), 2020. Links | BibTeX | Tags: Isoform Quantification
@article{nanostring,
  author    = {Zhang, Wei and Petegrosso, Raphael and Chang, Jae-Woong and Sun, Jiao and Yong, Jeongsik and Chien, Jeremy and Kuang, Rui},
  title     = {A large-scale comparative study of isoform expressions measured on four platforms},
  journal   = {BMC Genomics},
  volume    = {21},
  number    = {272},
  year      = {2020},
  date      = {2020-03-30},
  doi       = {10.1186/s12864-020-6643-8},
  url       = {https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-020-6643-8},
  keywords  = {Isoform Quantification},
  pubstate  = {published},
  tppubtype = {article}
} |
2019 |
Li, Zhuliu; Zhang, Wei; Huang, R. Stephanie; Kuang, Rui Learning a Low-rank Tensor of Pharmacogenomic Multi-relations from Biomedical Networks Inproceedings In: IEEE International Conference on Data Mining, 2019. Abstract | Links | BibTeX | Tags: Tensor
@inproceedings{GTCORP2019b,
  author    = {Li, Zhuliu and Zhang, Wei and Huang, R. Stephanie and Kuang, Rui},
  title     = {Learning a Low-rank Tensor of Pharmacogenomic Multi-relations from Biomedical Networks},
  booktitle = {IEEE International Conference on Data Mining},
  year      = {2019},
  date      = {2019-08-31},
  url       = {http://compbio.cs.umn.edu/08970888.pdf},
  abstract  = {Learning pharmacogenomic multi-relations among diseases, genes and chemicals from content-rich biomedical and biological networks can provide important guidance for drug discovery, drug repositioning and disease treatment. Most of the existing methods focus on imputing missing values in the disease-gene, disease-chemical and gene-chemical pairwise relations from the observed relations instead of being designed for learning high-order disease-gene-chemical multi-relations. To achieve the goal, we propose a general tensor-based optimization framework and a scalable Graph-Regularized Tensor Completion from Observed Pairwise Relations (GT-COPR) algorithm to infer the multi-relations among the entities across multiple networks in a low-rank tensor, based on manifold regularization with the graph Laplacian of a Cartesian, tensor or strong product of the networks, and consistencies between the collapsed tensors and the observed bipartite relations. Our theoretical analyses also prove the convergence and efficiency of GT-COPR. In the experiments, the tensor fiber-wise and slice-wise evaluations demonstrate the accuracy of GT-COPR for predicting the disease-gene-chemical associations across the large-scale protein-protein interactions network, chemical structural similarity network and phenotype-based human disease network; and the validation on Genomics of Drug Sensitivity in Cancer cell line dataset shows a potential clinical application of GT-COPR for learning disease-specific chemical-gene interactions. Statistical enrichment analysis demonstrates that GT-COPR is also capable of producing both topologically and biologically relevant disease, gene and chemical components with high significance. Source code: https://github.com/kuanglab/GT-COPR},
  keywords  = {Tensor},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Learning pharmacogenomic multi-relations among diseases, genes and chemicals from content-rich biomedical and biological networks can provide important guidance for drug discovery, drug repositioning and disease treatment. Most of the existing methods focus on imputing missing values in the disease-gene, disease-chemical and gene-chemical pairwise relations from the observed relations instead of being designed for learning high-order disease-gene-chemical multi-relations. To achieve the goal, we propose a general tensor-based optimization framework and a scalable Graph-Regularized Tensor Completion from Observed Pairwise Relations (GT-COPR) algorithm to infer the multi-relations among the entities across multiple networks in a low-rank tensor, based on manifold regularization with the graph Laplacian of a Cartesian, tensor or strong product of the networks, and consistencies between the collapsed tensors and the observed bipartite relations. Our theoretical analyses also prove the convergence and efficiency of GT-COPR. 
In the experiments, the tensor fiber-wise and slice-wise evaluations demonstrate the accuracy of GT-COPR for predicting the disease-gene-chemical associations across the large-scale protein-protein interactions network, chemical structural similarity network and phenotype-based human disease network; and the validation on Genomics of Drug Sensitivity in Cancer cell line dataset shows a potential clinical application of GT-COPR for learning disease-specific chemical-gene interactions. Statistical enrichment analysis demonstrates that GT-COPR is also capable of producing both topologically and biologically relevant disease, gene and chemical components with high significance. Source code: https://github.com/kuanglab/GT-COPR |
Petegrosso, Raphael; Li, Zhuliu; Kuang, Rui Machine Learning and Statistical Methods for Clustering Single-cell RNA-sequencing Data Journal Article In: Briefings in Bioinformatics, 2019. Abstract | Links | BibTeX | Tags: scRNA-Seq, scRNA-Seq Clustering
@article{petegrosso2019scrnaseq,
  author    = {Petegrosso, Raphael and Li, Zhuliu and Kuang, Rui},
  title     = {Machine Learning and Statistical Methods for Clustering Single-cell RNA-sequencing Data},
  journal   = {Briefings in Bioinformatics},
  year      = {2019},
  date      = {2019-06-29},
  doi       = {10.1093/bib/bbz063},
  url       = {https://doi.org/10.1093/bib/bbz063},
  abstract  = {Single-cell RNA-sequencing (scRNA-seq) technologies have enabled the large-scale whole-transcriptome profiling of each individual single cell in a cell population. A core analysis of the scRNA-seq transcriptome profiles is to cluster the single cells to reveal cell subtypes and infer cell lineages based on the relations among the cells. This article reviews the machine learning and statistical methods for clustering scRNA-seq transcriptomes developed in the past few years. The review focuses on how conventional clustering techniques such as hierarchical clustering, graph-based clustering, mixture models, $k$-means, ensemble learning, neural networks and density-based clustering are modified or customized to tackle the unique challenges in scRNA-seq data analysis, such as the dropout of low-expression genes, low and uneven read coverage of transcripts, highly variable total mRNAs from single cells and ambiguous cell markers in the presence of technical biases and irrelevant confounding biological variations. We review how cell-specific normalization, the imputation of dropouts and dimension reduction methods can be applied with new statistical or optimization strategies to improve the clustering of single cells. We will also introduce those more advanced approaches to cluster scRNA-seq transcriptomes in time series data and multiple cell populations and to detect rare cell types. Several software packages developed to support the cluster analysis of scRNA-seq data are also reviewed and experimentally compared to evaluate their performance and efficiency. Finally, we conclude with useful observations and possible future directions in scRNA-seq data analytics. AVAILABILITY: All the source code and data are available at https://github.com/kuanglab/single-cell-review},
  keywords  = {scRNA-Seq, scRNA-Seq Clustering},
  pubstate  = {published},
  tppubtype = {article}
}
Single-cell RNA-sequencing (scRNA-seq) technologies have enabled the large-scale whole-transcriptome profiling of each individual single cell in a cell population. A core analysis of the scRNA-seq transcriptome profiles is to cluster the single cells to reveal cell subtypes and infer cell lineages based on the relations among the cells. This article reviews the machine learning and statistical methods for clustering scRNA-seq transcriptomes developed in the past few years. The review focuses on how conventional clustering techniques such as hierarchical clustering, graph-based clustering, mixture models, $k$-means, ensemble learning, neural networks and density-based clustering are modified or customized to tackle the unique challenges in scRNA-seq data analysis, such as the dropout of low-expression genes, low and uneven read coverage of transcripts, highly variable total mRNAs from single cells and ambiguous cell markers in the presence of technical biases and irrelevant confounding biological variations. We review how cell-specific normalization, the imputation of dropouts and dimension reduction methods can be applied with new statistical or optimization strategies to improve the clustering of single cells. We will also introduce those more advanced approaches to cluster scRNA-seq transcriptomes in time series data and multiple cell populations and to detect rare cell types. 
Several software packages developed to support the cluster analysis of scRNA-seq data are also reviewed and experimentally compared to evaluate their performance and efficiency. Finally, we conclude with useful observations and possible future directions in scRNA-seq data analytics. AVAILABILITY: All the source code and data are available at https://github.com/kuanglab/single-cell-review |
Song, Ying; Song, Tianci; Kuang, Rui Path segmentation for movement trajectories with irregular sampling frequency using space-time interpolation and density-based spatial clustering Journal Article In: Transactions in GIS, 23 (3), pp. 558–578, 2019. Abstract | Links | BibTeX | Tags: Path Segmentation, Spatial Clustering
@article{song2019path,
  author    = {Song, Ying and Song, Tianci and Kuang, Rui},
  title     = {Path segmentation for movement trajectories with irregular sampling frequency using space-time interpolation and density-based spatial clustering},
  journal   = {Transactions in GIS},
  volume    = {23},
  number    = {3},
  pages     = {558--578},
  year      = {2019},
  date      = {2019-06-05},
  doi       = {10.1111/tgis.12549},
  url       = {https://doi.org/10.1111/tgis.12549},
  abstract  = {Path segmentation methods have been developed to distinguish stops and moves along movement trajectories. However, most studies do not focus on handling irregular sampling frequency of the movement data. This article proposes a four‐step method to handle various time intervals between two consecutive records, including parameter setting, space‐time interpolation, density‐based spatial clustering, and integrating the geographic context. The article uses GPS tracking data provided by HOURCAR, a non‐profit car‐sharing service in Minnesota, as a case study to demonstrate our method and present the results. We also implement the DB‐SMoT algorithm as a comparison. The results show that our four‐step method can handle various time intervals between consecutive records, group consecutive stops close to each other, and distinguish different types of stops and their inferred activities. These results can provide novel insights into car‐sharing behaviors such as trip purposes and activity scheduling.},
  keywords  = {Path Segmentation, Spatial Clustering},
  pubstate  = {published},
  tppubtype = {article}
}
Path segmentation methods have been developed to distinguish stops and moves along movement trajectories. However, most studies do not focus on handling irregular sampling frequency of the movement data. 
This article proposes a four‐step method to handle various time intervals between two consecutive records, including parameter setting, space‐time interpolation, density‐based spatial clustering, and integrating the geographic context. The article uses GPS tracking data provided by HOURCAR, a non‐profit car‐sharing service in Minnesota, as a case study to demonstrate our method and present the results. We also implement the DB‐SMoT algorithm as a comparison. The results show that our four‐step method can handle various time intervals between consecutive records, group consecutive stops close to each other, and distinguish different types of stops and their inferred activities. These results can provide novel insights into car‐sharing behaviors such as trip purposes and activity scheduling. |
Petegrosso, Raphael; Li, Zhuliu; Srour, Molly A; Saad, Yousef; Zhang, Wei; Kuang, Rui Scalable Remote Homology Detection and Fold Recognition in Massive Protein Networks Journal Article In: PROTEINS: Structure, Function, and Bioinformatics, 87 (6), pp. 478-491, 2019. Abstract | Links | BibTeX | Tags: Fold Recognition, Protein Networks, Protein Remote Homology Detection
@article{scalable2019petegrosso,
  author    = {Petegrosso, Raphael and Li, Zhuliu and Srour, Molly A. and Saad, Yousef and Zhang, Wei and Kuang, Rui},
  title     = {Scalable Remote Homology Detection and Fold Recognition in Massive Protein Networks},
  journal   = {PROTEINS: Structure, Function, and Bioinformatics},
  volume    = {87},
  number    = {6},
  pages     = {478--491},
  year      = {2019},
  date      = {2019-01-31},
  doi       = {10.1002/prot.25669},
  url       = {https://onlinelibrary.wiley.com/doi/abs/10.1002/prot.25669},
  abstract  = {The global connectivities in very large protein similarity networks contain traces of evolution among the proteins for detecting protein remote evolutionary relations or structural similarities. To investigate how well a protein network captures the evolutionary information, a key limitation is the intensive computation of pairwise sequence similarities needed to construct very large protein networks. In this paper, we introduce Label Propagation on Low-rank Kernel Approximation (LP-LOKA) for searching massively large protein networks. LP-LOKA propagates initial protein similarities in a low-rank graph by Nystrom approximation without computing all pairwise similarities. With scalable parallel implementations based on distributed-memory using message-passing interface and Apache-Hadoop/Spark on cloud, LP-LOKA can search protein networks with one million proteins or more. In the experiments on Swiss-Prot/ADDA/CASP data, LP-LOKA significantly improved protein ranking over the widely used HMM-HMM or profile-sequence alignment methods utilizing large protein networks. It was observed that the larger the protein similarity network, the better the performance, especially on relatively small protein superfamilies and folds. The results suggest that computing massively large protein network is necessary to meet the growing need of annotating proteins from newly sequenced species and LP-LOKA is both scalable and accurate for searching massively large protein networks.},
  keywords  = {Fold Recognition, Protein Networks, Protein Remote Homology Detection},
  pubstate  = {published},
  tppubtype = {article}
}
The global connectivities in very large protein similarity networks contain traces of evolution among the proteins for detecting protein remote evolutionary relations or structural similarities. To investigate how well a protein network captures the evolutionary information, a key limitation is the intensive computation of pairwise sequence similarities needed to construct very large protein networks. In this paper, we introduce Label Propagation on Low-rank Kernel Approximation (LP-LOKA) for searching massively large protein networks. LP-LOKA propagates initial protein similarities in a low-rank graph by Nystrom approximation without computing all pairwise similarities. With scalable parallel implementations based on distributed-memory using message-passing interface and Apache-Hadoop/Spark on cloud, LP-LOKA can search protein networks with one million proteins or more. In the experiments on Swiss-Prot/ADDA/CASP data, LP-LOKA significantly improved protein ranking over the widely used HMM-HMM or profile-sequence alignment methods utilizing large protein networks. It was observed that the larger the protein similarity network, the better the performance, especially on relatively small protein superfamilies and folds. 
The results suggest that computing massively large protein network is necessary to meet the growing need of annotating proteins from newly sequenced species and LP-LOKA is both scalable and accurate for searching massively large protein networks. |
2018 |
Hauck, Amy K; Zhou, Tong; Hahn, Wendy S; Petegrosso, Raphael; Kuang, Rui; Chen, Yue; Bernlohr, David A Obesity-Induced Protein Carbonylation In Murine Adipose Tissue Regulates The DNA Binding Domain Of Nuclear Zinc-Finger Proteins Journal Article Forthcoming In: Journal of Biological Chemistry, Forthcoming. Abstract | Links | BibTeX | Tags:
@article{Hauck2018,
  author    = {Hauck, Amy K. and Zhou, Tong and Hahn, Wendy S. and Petegrosso, Raphael and Kuang, Rui and Chen, Yue and Bernlohr, David A.},
  title     = {Obesity-Induced Protein Carbonylation In Murine Adipose Tissue Regulates The DNA Binding Domain Of Nuclear Zinc-Finger Proteins},
  journal   = {Journal of Biological Chemistry},
  year      = {2018},
  date      = {2018-07-16},
  doi       = {10.1074/jbc.RA118.003469},
  url       = {http://www.jbc.org/content/early/2018/07/16/jbc.RA118.003469.abstract},
  abstract  = {In obesity-linked insulin resistance, oxidative stress in adipocytes leads to lipid peroxidation and subsequent carbonylation of proteins by diffusible lipid electrophiles. Reduction in oxidative stress attenuates protein carbonylation and insulin resistance suggesting lipid modification of proteins may play a role in metabolic disease, but the mechanisms remain incompletely understood. Herein we show that in vivo, diet-induced obesity in mice surprisingly results in preferential carbonylation of nuclear proteins by 4-hydroxy-trans 2,3 nonenal (4-HNE) or 4-hydroxy-trans 2,3 hexenal (4-HHE). Proteomic and structural analyses revealed that residues in or around the sites of zinc coordination of zinc finger proteins, such as those containing the C2H2 or MATRIN, RING, C3H1, or N4-type DNA binding domains, are particularly susceptible to carbonylation by lipid aldehydes. These observations strongly suggest that carbonylation functionally disrupts protein secondary structure supported by metal coordination. Analysis of one such target, the nuclear protein estrogen-related receptor gamma (ERR-γ), showed that ERR-γ is modified by 4-HHE in the obese state. In vitro carbonylation decreased the DNA-binding capacity of ERR-γ and correlated with the obesity-linked down regulation of many key genes promoting mitochondrial bioenergetics. Taken together, these findings reveal a novel mechanistic connection between oxidative stress and metabolic dysfunction arising from carbonylation of nuclear zinc-finger proteins such as the transcriptional regulator ERR-γ.},
  keywords  = {},
  pubstate  = {forthcoming},
  tppubtype = {article}
}
In obesity-linked insulin resistance, oxidative stress in adipocytes leads to lipid peroxidation and subsequent carbonylation of proteins by diffusible lipid electrophiles. Reduction in oxidative stress attenuates protein carbonylation and insulin resistance suggesting lipid modification of proteins may play a role in metabolic disease, but the mechanisms remain incompletely understood. Herein we show that in vivo, diet-induced obesity in mice surprisingly results in preferential carbonylation of nuclear proteins by 4-hydroxy-trans 2,3 nonenal (4-HNE) or 4-hydroxy-trans 2,3 hexenal (4-HHE). Proteomic and structural analyses revealed that residues in or around the sites of zinc coordination of zinc finger proteins, such as those containing the C2H2 or MATRIN, RING, C3H1, or N4-type DNA binding domains, are particularly susceptible to carbonylation by lipid aldehydes. These observations strongly suggest that carbonylation functionally disrupts protein secondary structure supported by metal coordination. Analysis of one such target, the nuclear protein estrogen-related receptor gamma (ERR-γ), showed that ERR-γ is modified by 4-HHE in the obese state. In vitro carbonylation decreased the DNA-binding capacity of ERR-γ and correlated with the obesity-linked down regulation of many key genes promoting mitochondrial bioenergetics. 
Taken together, these findings reveal a novel mechanistic connection between oxidative stress and metabolic dysfunction arising from carbonylation of nuclear zinc-finger proteins such as the transcriptional regulator ERR-γ. |
Chang, Jae-Woong; Zhang, Wei; Yeh, Hsin Sung; Park, Meeyeon; Yao, Chengguo; Shi, Yongsheng; Kuang, Rui#; Yong, Jeongsik# An integrative model for alternative polyadenylation, IntMAP, delineates mTOR-modulated endoplasmic reticulum stress response Journal Article In: Nucleic Acids Research, 46 (12), pp. 5996–6008, 2018. Abstract | BibTeX | Tags: Transcriptome
@article{chang2018,
  author        = {Chang, Jae-Woong and Zhang, Wei and Yeh, Hsin Sung and Park, Meeyeon and Yao, Chengguo and Shi, Yongsheng and Kuang, Rui and Yong, Jeongsik},
  title         = {An integrative model for alternative polyadenylation, IntMAP, delineates mTOR-modulated endoplasmic reticulum stress response},
  journal       = {Nucleic Acids Research},
  volume        = {46},
  number        = {12},
  pages         = {5996--6008},
  year          = {2018},
  date          = {2018-07-06},
  abstract      = {3'-untranslated regions (UTRs) can vary through the use of alternative polyadenylation sites during pre-mRNA processing. Multiple publically available pipelines combining high profiling technologies and bioinformatics tools have been developed to catalog changes in 3'-UTR lengths. In our recent RNA-seq experiments using cells with hyper-activated mammalian target of rapamycin (mTOR), we found that cellular mTOR activation leads to transcriptome-wide alternative polyadenylation (APA), resulting in the activation of multiple cellular pathways. Here, we developed a novel bioinformatics algorithm, IntMAP, which integrates RNA-Seq and PolyA Site (PAS)-Seq data for a comprehensive characterization of APA events. By applying IntMAP to the datasets from cells with hyper-activated mTOR, we identified novel APA events that could otherwise not be identified by either profiling method alone. Several transcription factors including Cebpg (CCAAT/enhancer binding protein gamma) were among the newly discovered APA transcripts, indicating that diverse transcriptional networks may be regulated by mTOR-coordinated APA. The prevention of APA in Cebpg using the CRISPR/cas9-mediated genome editing tool showed that mTOR-driven 3'-UTR shortening in Cebpg is critical in protecting cells from endoplasmic reticulum (ER) stress. Taken together, we present IntMAP as a new bioinformatics algorithm for APA analysis by which we expand our understanding of the physiological role of mTOR-coordinated APA events to ER stress response. IntMAP toolbox is available at http://compbio.cs.umn.edu/IntMAP/.},
  keywords      = {Transcriptome},
  pubstate      = {published},
  tppubtype     = {article},
  internal-note = {Rui Kuang and Jeongsik Yong each carried a trailing hash mark in the original author list (presumably co-corresponding authors; confirm). The marks were removed from the author field because they are not part of the names.}
}
3'-untranslated regions (UTRs) can vary through the use of alternative polyadenylation sites during pre-mRNA processing. Multiple publically available pipelines combining high profiling technologies and bioinformatics tools have been developed to catalog changes in 3'-UTR lengths. In our recent RNA-seq experiments using cells with hyper-activated mammalian target of rapamycin (mTOR), we found that cellular mTOR activation leads to transcriptome-wide alternative polyadenylation (APA), resulting in the activation of multiple cellular pathways. Here, we developed a novel bioinformatics algorithm, IntMAP, which integrates RNA-Seq and PolyA Site (PAS)-Seq data for a comprehensive characterization of APA events. By applying IntMAP to the datasets from cells with hyper-activated mTOR, we identified novel APA events that could otherwise not be identified by either profiling method alone. Several transcription factors including Cebpg (CCAAT/enhancer binding protein gamma) were among the newly discovered APA transcripts, indicating that diverse transcriptional networks may be regulated by mTOR-coordinated APA. The prevention of APA in Cebpg using the CRISPR/cas9-mediated genome editing tool showed that mTOR-driven 3'-UTR shortening in Cebpg is critical in protecting cells from endoplasmic reticulum (ER) stress. 
Taken together, we present IntMAP as a new bioinformatics algorithm for APA analysis by which we expand our understanding of the physiological role of mTOR-coordinated APA events to ER stress response. IntMAP toolbox is available at http://compbio.cs.umn.edu/IntMAP/. |
Kang, Xiaojun; Xu, Gang; Lee, Byungha; Chen, Chen; Zhang, Huanan; Kuang, Rui; Ni, Min HRB2 and BBX21 interaction modulates Arabidopsis ABI5 locus and stomatal aperture Journal Article In: Plant, Cell & Environment, 41, pp. 1912-1925, 2018. Abstract | Links | BibTeX | Tags:
@article{Kang2018,
  author    = {Kang, Xiaojun and Xu, Gang and Lee, Byungha and Chen, Chen and Zhang, Huanan and Kuang, Rui and Ni, Min},
  title     = {HRB2 and BBX21 interaction modulates Arabidopsis ABI5 locus and stomatal aperture},
  journal   = {Plant, Cell \& Environment},
  volume    = {41},
  pages     = {1912--1925},
  year      = {2018},
  date      = {2018-05-10},
  doi       = {10.1111/pce.13336},
  url       = {https://onlinelibrary.wiley.com/doi/abs/10.1111/pce.13336},
  abstract  = {Blue light triggers the opening of stomata in the morning to allow CO2 uptake and water loss through transpiration. During the day, plants may experience periodic drought and accumulate abscisic acid (ABA). ABA antagonizes blue light signalling through phosphatidic acid and reduces stomatal aperture. This study reveals a molecular mechanism by which two light signalling proteins interact to repress ABA signalling in the control of stomatal aperture. A hypersensitive to red and blue 2 (hrb2) mutant has a defective ATP‐dependent chromatin‐remodelling factor, PKL, in the chromodomain/helicase/DNA binding family. HRB2 enhances the light‐induced expression of a B‐box transcription factor gene, BBX21. BBX21 binds a T/G box in the ABI5 promoter and recruits HRB2 to modulate the chromatin structure at the ABI5 locus. Mutation in either HRB2 or BBX21 led to reduced water loss and ABA hypersensitivity. This hypersensitivity to ABA was well explained by the enhanced expression of the ABA signalling gene ABI5 in both mutants. Indeed, stomatal aperture was significantly reduced by ABI5 overexpression in the absence or presence of ABA under monochromatic light conditions. Overall, we present a regulatory loop in which two light signalling proteins repress ABA signalling to sustain gas exchange when plants experience periodic drought.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Blue light triggers the opening of stomata in the morning to allow CO2 uptake and water loss through transpiration. During the day, plants may experience periodic drought and accumulate abscisic acid (ABA). ABA antagonizes blue light signalling through phosphatidic acid and reduces stomatal aperture. This study reveals a molecular mechanism by which two light signalling proteins interact to repress ABA signalling in the control of stomatal aperture. A hypersensitive to red and blue 2 (hrb2) mutant has a defective ATP‐dependent chromatin‐remodelling factor, PKL, in the chromodomain/helicase/DNA binding family. HRB2 enhances the light‐induced expression of a B‐box transcription factor gene, BBX21. BBX21 binds a T/G box in the ABI5 promoter and recruits HRB2 to modulate the chromatin structure at the ABI5 locus. Mutation in either HRB2 or BBX21 led to reduced water loss and ABA hypersensitivity. This hypersensitivity to ABA was well explained by the enhanced expression of the ABA signalling gene ABI5 in both mutants. Indeed, stomatal aperture was significantly reduced by ABI5 overexpression in the absence or presence of ABA under monochromatic light conditions. Overall, we present a regulatory loop in which two light signalling proteins repress ABA signalling to sustain gas exchange when plants experience periodic drought. |
Zhang, Huanan; Lee, Catherine A A; Li, Zhuliu; Garbe, John R; Eide, Cindy R; Petegrosso, Raphael; Kuang, Rui; Tolar, Jakub A Multitask Clustering Approach for Single-cell RNA-Seq Analysis in Recessive Dystrophic Epidermolysis Bullosa Journal Article In: PLOS Computational Biology, 14 (4), 2018. Abstract | Links | BibTeX | Tags: clustering, multitask learning, scRNA-Seq
@article{multitask_zhang,
  author    = {Zhang, Huanan and Lee, Catherine A. A. and Li, Zhuliu and Garbe, John R. and Eide, Cindy R. and Petegrosso, Raphael and Kuang, Rui and Tolar, Jakub},
  title     = {A Multitask Clustering Approach for Single-cell RNA-Seq Analysis in Recessive Dystrophic Epidermolysis Bullosa},
  journal   = {PLOS Computational Biology},
  volume    = {14},
  number    = {4},
  year      = {2018},
  date      = {2018-04-05},
  doi       = {10.1371/journal.pcbi.1006053},
  url       = {http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1006053},
  abstract  = {Single-cell RNA sequencing (scRNA-seq) has been widely applied to discover new cell types by detecting sub-populations in a heterogeneous group of cells. Since scRNA-seq experiments have lower read coverage/tag counts and introduce more technical biases compared to bulk RNA-seq experiments, the limited number of sampled cells combined with the experimental biases and other dataset specific variations presents a challenge to cross-dataset analysis and discovery of relevant biological variations across multiple cell populations. In this paper, we introduce a method of variance-driven multitask clustering of single-cell RNA-seq data (scVDMC) that utilizes multiple single-cell populations from biological replicates or different samples. scVDMC clusters single cells in multiple scRNA-seq experiments of similar cell types and markers but varying expression patterns such that the scRNA-seq data are better integrated than typical pooled analyses which only increase the sample size. By controlling the variance among the cell clusters within each dataset and across all the datasets, scVDMC detects cell sub-populations in each individual experiment with shared cell-type markers but varying cluster centers among all the experiments. Applied to two real scRNA-seq datasets with several replicates and one large-scale Drop-seq dataset on three patient samples, scVDMC more accurately detected cell populations and known cell markers than pooled clustering and other recently proposed scRNA-seq clustering methods. In the case study applied to in-house Recessive Dystrophic Epidermolysis Bullosa (RDEB) scRNA-seq data, scVDMC revealed several new cell types and unknown markers validated by flow cytometry.},
  keywords  = {clustering, multitask learning, scRNA-Seq},
  pubstate  = {published},
  tppubtype = {article}
}
Single-cell RNA sequencing (scRNA-seq) has been widely applied to discover new cell types by detecting sub-populations in a heterogeneous group of cells. Since scRNA-seq experiments have lower read coverage/tag counts and introduce more technical biases compared to bulk RNA-seq experiments, the limited number of sampled cells combined with the experimental biases and other dataset specific variations presents a challenge to cross-dataset analysis and discovery of relevant biological variations across multiple cell populations. In this paper, we introduce a method of variance-driven multitask clustering of single-cell RNA-seq data (scVDMC) that utilizes multiple single-cell populations from biological replicates or different samples. scVDMC clusters single cells in multiple scRNA-seq experiments of similar cell types and markers but varying expression patterns such that the scRNA-seq data are better integrated than typical pooled analyses which only increase the sample size. 
By controlling the variance among the cell clusters within each dataset and across all the datasets, scVDMC detects cell sub-populations in each individual experiment with shared cell-type markers but varying cluster centers among all the experiments. Applied to two real scRNA-seq datasets with several replicates and one large-scale Drop-seq dataset on three patient samples, scVDMC more accurately detected cell populations and known cell markers than pooled clustering and other recently proposed scRNA-seq clustering methods. In the case study applied to in-house Recessive Dystrophic Epidermolysis Bullosa (RDEB) scRNA-seq data, scVDMC revealed several new cell types and unknown markers validated by flow cytometry. |
Xiang, Xiaoyu; Wang, Yuanguo; Zhang, Hongbin; Piao, Jinhua; Muthusamy, Selvaraj; Wang, Lei; Deng, Yibin; Zhang, Wei; Kuang, Rui; Billadeau, Daniel D; Huang, Shengbing; Lai, Jinping; Urrutia, Raul; Kang, Ningling Vasodilator-stimulated phosphoprotein promotes liver metastasis of gastrointestinal cancer by activating a β1-integrin-FAK-YAP1/TAZ signaling pathway Journal Article Forthcoming In: npj Precision Oncology, Forthcoming. Abstract | Links | BibTeX | Tags: @article{xiang2018vasodilator, title = {Vasodilator-stimulated phosphoprotein promotes liver metastasis of gastrointestinal cancer by activating a β1-integrin-FAK-YAP1/TAZ signaling pathway}, author = {Xiaoyu Xiang and Yuanguo Wang and Hongbin Zhang and Jinhua Piao and Selvaraj Muthusamy and Lei Wang and Yibin Deng and Wei Zhang and Rui Kuang and Daniel D. Billadeau and Shengbing Huang and Jinping Lai and Raul Urrutia and Ningling Kang}, url = {https://www.nature.com/articles/s41698-017-0045-7}, doi = {10.1038/s41698-017-0045-7}, year = {2018}, date = {2018-03-22}, journal = {npj Precision Oncology}, abstract = {Extracellular matrix (ECM)-induced β1-integrin-FAK signaling promotes cell attachment, survival, and migration of cancer cells in a distant organ so as to enable cancer metastasis. However, mechanisms governing activation of the β1-integrin-FAK signaling remain incompletely understood. Here, we report that vasodilator-stimulated phosphoprotein (VASP), an actin binding protein, is required for ECM–mediated β1-integrin-FAK-YAP1/TAZ signaling in gastrointestinal (GI) cancer cells and their liver metastasis. In patient-derived samples, VASP is upregulated in 53 of 63 colorectal cancers and 43 of 53 pancreatic ductal adenocarcinomas and high VASP levels correlate with liver metastasis and reduced patient survival. 
In a Matrigel-based 3-dimensional (3D) culture model, short hairpin RNA (shRNA)–mediated VASP knockdown in colorectal cancer cells (KM12L4, HCT116, and HT29) and pancreatic cancer cells (L3.6 and MIA PaCa-1) suppresses the growth of 3D cancer spheroids. Mechanistic studies reveal that VASP knockdown suppresses FAK phosphorylation and YAP1/TAZ protein levels, but not Akt or Erk-related pathways and that YAP1/TAZ proteins are enhanced by the β1-integrin-FAK signaling. Additionally, VASP regulates the β1-integrin-FAK-YAP1/TAZ signaling by at least two mechanisms: (1) promoting ECM-mediated β1-integrin activation and (2) regulating YAP1/TAZ dephosphorylation at downstream of RhoA to enhance the stability of YAP1/TAZ proteins. In agreement with these, preclinical studies with two experimental liver metastasis mouse models demonstrate that VASP knockdown suppresses GI cancer liver metastasis, β1-integrin activation, and YAP1/TAZ levels of metastatic cancer cells. Together, our data support VASP as a treatment target for liver metastasis of colorectal and pancreatic cancers.}, keywords = {}, pubstate = {forthcoming}, tppubtype = {article} } Extracellular matrix (ECM)-induced β1-integrin-FAK signaling promotes cell attachment, survival, and migration of cancer cells in a distant organ so as to enable cancer metastasis. However, mechanisms governing activation of the β1-integrin-FAK signaling remain incompletely understood. Here, we report that vasodilator-stimulated phosphoprotein (VASP), an actin binding protein, is required for ECM–mediated β1-integrin-FAK-YAP1/TAZ signaling in gastrointestinal (GI) cancer cells and their liver metastasis. In patient-derived samples, VASP is upregulated in 53 of 63 colorectal cancers and 43 of 53 pancreatic ductal adenocarcinomas and high VASP levels correlate with liver metastasis and reduced patient survival. 
In a Matrigel-based 3-dimensional (3D) culture model, short hairpin RNA (shRNA)–mediated VASP knockdown in colorectal cancer cells (KM12L4, HCT116, and HT29) and pancreatic cancer cells (L3.6 and MIA PaCa-1) suppresses the growth of 3D cancer spheroids. Mechanistic studies reveal that VASP knockdown suppresses FAK phosphorylation and YAP1/TAZ protein levels, but not Akt or Erk-related pathways and that YAP1/TAZ proteins are enhanced by the β1-integrin-FAK signaling. Additionally, VASP regulates the β1-integrin-FAK-YAP1/TAZ signaling by at least two mechanisms: (1) promoting ECM-mediated β1-integrin activation and (2) regulating YAP1/TAZ dephosphorylation at downstream of RhoA to enhance the stability of YAP1/TAZ proteins. In agreement with these, preclinical studies with two experimental liver metastasis mouse models demonstrate that VASP knockdown suppresses GI cancer liver metastasis, β1-integrin activation, and YAP1/TAZ levels of metastatic cancer cells. Together, our data support VASP as a treatment target for liver metastasis of colorectal and pancreatic cancers. |
Li, Zhuliu; Petegrosso, Raphael; Smith, Shaden; Sterling, David; Karypis, George; Kuang, Rui Scalable Label Propagation for Multi-relational Learning on Tensor Product Graph Conference arXiv, 2018. Abstract | Links | BibTeX | Tags: Network Alignment, Network-based Learning, Semi-supervised Learning @conference{li2018scalable, title = {Scalable Label Propagation for Multi-relational Learning on Tensor Product Graph }, author = {Zhuliu Li and Raphael Petegrosso and Shaden Smith and David Sterling and George Karypis and Rui Kuang}, url = {https://arxiv.org/abs/1802.07379}, year = {2018}, date = {2018-02-20}, booktitle = {arXiv}, abstract = {Label propagation on the tensor product of multiple graphs can infer multi-relations among the entities across the graphs by learning labels in a tensor. However, the tensor formulation is only empirically scalable up to three graphs due to the exponential complexity of computing tensors. In this paper, we propose an optimization formulation and a scalable Lowrank Tensor-based Label Propagation algorithm (LowrankTLP). The optimization formulation minimizes the rank-k approximation error for computing the closed-form solution of label propagation on a tensor product graph with efficient tensor computations used in LowrankTLP. LowrankTLP takes either a sparse tensor of known multi-relations or pairwise relations between each pair of graphs as the input to infer unknown multi-relations by semi-supervised learning on the tensor product graph. We also accelerate LowrankTLP with parallel tensor computation which enabled label propagation on a tensor product of 100 graphs of size 1000 within 150 seconds in simulation. LowrankTLP was also successfully applied to multi-relational learning for predicting author-paper-venue in publication records, alignment of several protein-protein interaction networks across species and alignment of segmented regions across up to 7 CT scan images. 
The experiments prove that LowrankTLP indeed well approximates the original label propagation with high scalability.}, keywords = {Network Alignment, Network-based Learning, Semi-supervised Learning}, pubstate = {published}, tppubtype = {conference} } Label propagation on the tensor product of multiple graphs can infer multi-relations among the entities across the graphs by learning labels in a tensor. However, the tensor formulation is only empirically scalable up to three graphs due to the exponential complexity of computing tensors. In this paper, we propose an optimization formulation and a scalable Lowrank Tensor-based Label Propagation algorithm (LowrankTLP). The optimization formulation minimizes the rank-k approximation error for computing the closed-form solution of label propagation on a tensor product graph with efficient tensor computations used in LowrankTLP. LowrankTLP takes either a sparse tensor of known multi-relations or pairwise relations between each pair of graphs as the input to infer unknown multi-relations by semi-supervised learning on the tensor product graph. We also accelerate LowrankTLP with parallel tensor computation which enabled label propagation on a tensor product of 100 graphs of size 1000 within 150 seconds in simulation. LowrankTLP was also successfully applied to multi-relational learning for predicting author-paper-venue in publication records, alignment of several protein-protein interaction networks across species and alignment of segmented regions across up to 7 CT scan images. The experiments prove that LowrankTLP indeed well approximates the original label propagation with high scalability. |
2017 |
Zhang, Huanan; Roe, David; Kuang, Rui Detecting Population-differentiation Copy Number Variants in Human Population Tree by Sparse Group Selection Journal Article Forthcoming In: IEEE/ACM Transactions on Computational Biology and Bioinformatics, 16 (2), pp. 538–549, Forthcoming. Abstract | Links | BibTeX | Tags: DNA Copy Number Variation, Sparse Group Learning @article{Kuang2017, title = {Detecting Population-differentiation Copy Number Variants in Human Population Tree by Sparse Group Selection}, author = {Huanan Zhang and David Roe and Rui Kuang}, url = {http://ieeexplore.ieee.org/document/8168351/}, year = {2017}, date = {2017-12-08}, journal = {IEEE/ACM Transactions on Computational Biology and Bioinformatics}, volume = {16}, number = {2}, pages = {538--549}, abstract = {Copy-number variants (CNVs) account for a substantial proportion of human genetic variations. Understanding the CNV diversities across populations is a computational challenge because CNV patterns are often present in several related populations and only occur in a subgroup of individuals within each of the population. This paper introduces a tree-guided sparse group selection algorithm (treeSGS) to detect population-differentiation CNV markers of subgroups across populations organized by a phylogenetic tree of human populations. The treeSGS algorithm detects CNV markers of populations associated with nodes from all levels of the tree such that the evolutionary relations among the populations are incorporated for more accurate detection of population-differentiation CNVs. We applied treeSGS algorithm to study the 1179 samples from the 11 populations in Hapmap3 CNV data. The treeSGS algorithm accurately identifies CNV markers of each population and the collection of populations organized under the branches of the human population tree, validated by consistency among family trios and SNP characterizations of the CNV regions. 
Further comparison between the detected CNV markers and other population-differentiation CNVs reported in 1000 genome data and other recent studies also shows that treeSGS can significantly improve the current annotations of population-differentiation CNV markers. TreeSGS package is available at http://compbio.cs.umn.edu/treesgs.}, keywords = {DNA Copy Number Variation, Sparse Group Learning}, pubstate = {forthcoming}, tppubtype = {article} } Copy-number variants (CNVs) account for a substantial proportion of human genetic variations. Understanding the CNV diversities across populations is a computational challenge because CNV patterns are often present in several related populations and only occur in a subgroup of individuals within each of the population. This paper introduces a tree-guided sparse group selection algorithm (treeSGS) to detect population-differentiation CNV markers of subgroups across populations organized by a phylogenetic tree of human populations. The treeSGS algorithm detects CNV markers of populations associated with nodes from all levels of the tree such that the evolutionary relations among the populations are incorporated for more accurate detection of population-differentiation CNVs. We applied treeSGS algorithm to study the 1179 samples from the 11 populations in Hapmap3 CNV data. The treeSGS algorithm accurately identifies CNV markers of each population and the collection of populations organized under the branches of the human population tree, validated by consistency among family trios and SNP characterizations of the CNV regions. Further comparison between the detected CNV markers and other population-differentiation CNVs reported in 1000 genome data and other recent studies also shows that treeSGS can significantly improve the current annotations of population-differentiation CNV markers. TreeSGS package is available at http://compbio.cs.umn.edu/treesgs. |
Zhang, Wei; Chien, Jeremy; Yong, Jeongsik; Kuang, Rui Network-based Machine Learning and Graph Theory Algorithms for Precision Oncology Journal Article In: npj Precision Oncology, (25), 2017. Abstract | Links | BibTeX | Tags: Cancer Genomics, Network-based Learning, Phenome-genome Association, Protein-Protein Interaction Network, Semi-supervised Learning @article{networkreview2017, title = {Network-based Machine Learning and Graph Theory Algorithms for Precision Oncology}, author = {Wei Zhang and Jeremy Chien and Jeongsik Yong and Rui Kuang}, url = {https://www.nature.com/articles/s41698-017-0029-7}, doi = {10.1038/s41698-017-0029-7}, year = {2017}, date = {2017-08-08}, journal = {npj Precision Oncology}, number = {25}, abstract = {Network-based analytics plays an increasingly important role in precision oncology. Growing evidence in recent studies suggests that cancer can be better understood through mutated or dysregulated pathways or networks rather than individual mutations and that the efficacy of repositioned drugs can be inferred from disease modules in molecular networks. This article reviews network-based machine learning and graph theory algorithms for integrative analysis of personal genomic data and biomedical knowledge bases to identify tumor-specific molecular mechanisms, candidate targets and repositioned drugs for personalized treatment. The review focuses on the algorithmic design and mathematical formulation of these methods to facilitate applications and implementations of network-based analysis in the practice of precision oncology. We review the methods applied in three scenarios to integrate genomic data and network models in different analysis pipelines, and we examine three categories of network-based approaches for repositioning drugs in drug-disease-gene networks. 
In addition, we perform a comprehensive subnetwork/pathway analysis of mutations in 31 cancer genome projects in the Cancer Genome Atlas (TCGA) and present a detailed case study on ovarian cancer. Finally, we discuss interesting observations, potential pitfalls and future directions in network-based precision oncology.}, keywords = {Cancer Genomics, Network-based Learning, Phenome-genome Association, Protein-Protein Interaction Network, Semi-supervised Learning}, pubstate = {published}, tppubtype = {article} } Network-based analytics plays an increasingly important role in precision oncology. Growing evidence in recent studies suggests that cancer can be better understood through mutated or dysregulated pathways or networks rather than individual mutations and that the efficacy of repositioned drugs can be inferred from disease modules in molecular networks. This article reviews network-based machine learning and graph theory algorithms for integrative analysis of personal genomic data and biomedical knowledge bases to identify tumor-specific molecular mechanisms, candidate targets and repositioned drugs for personalized treatment. The review focuses on the algorithmic design and mathematical formulation of these methods to facilitate applications and implementations of network-based analysis in the practice of precision oncology. We review the methods applied in three scenarios to integrate genomic data and network models in different analysis pipelines, and we examine three categories of network-based approaches for repositioning drugs in drug-disease-gene networks. In addition, we perform a comprehensive subnetwork/pathway analysis of mutations in 31 cancer genome projects in the Cancer Genome Atlas (TCGA) and present a detailed case study on ovarian cancer. Finally, we discuss interesting observations, potential pitfalls and future directions in network-based precision oncology. |
Zhang, Huanan; Cheng, Feng; Xiao, Yuguo; Kang, Xiaojun; Wang, Xiaowu; Kuang, Rui; Ni, Min Global analysis of canola genes targeted by SHORT HYPOCOTYL UNDER BLUE 1 during endosperm and embryo development Journal Article In: The Plant Journal, 91 (1), pp. 158-171, 2017. Abstract | Links | BibTeX | Tags: @article{huananplant2016b, title = {Global analysis of canola genes targeted by SHORT HYPOCOTYL UNDER BLUE 1 during endosperm and embryo development}, author = {Huanan Zhang and Feng Cheng and Yuguo Xiao and Xiaojun Kang and Xiaowu Wang and Rui Kuang and Min Ni}, url = {http://onlinelibrary.wiley.com/doi/10.1111/tpj.13542/abstract}, year = {2017}, date = {2017-07-01}, journal = {The Plant Journal}, volume = {91}, number = {1}, pages = {158-171}, abstract = {Seed development in dicots includes early endosperm proliferation followed by growth of the embryo to replace the endosperm. Endosperm proliferation in dicots not only provides nutrient supplies for subsequent embryo development but also enforces a space limitation, influencing final seed size. Overexpression of Arabidopsis SHORT HYPOCOTYL UNDER BLUE1::uidA (SHB1:uidA) in canola produces large seeds. We performed global analysis of the canola genes that were expressed and influenced by SHB1 during early endosperm proliferation at 8 days after pollination (DAP) and late embryo development at 13 DAP. Overexpression of SHB1 altered the expression of 973 genes at 8 DAP and 1035 genes at 13 DAP. We also surveyed the global SHB1 association sites, and merging of these sites with the RNA sequencing data identified a set of canola genes targeted by SHB1. The 8-DAP list includes positive and negative genes that influence endosperm proliferation and are homologous to Arabidopsis MINI3, IKU2, SHB1, AGL62, FIE and AP2. 
We revealed a major role for SHB1 in canola endosperm development based on the dynamics of SHB1-altered gene expression, the magnitude of SHB1 chromatin immunoprecipitation enrichment and the over-representation of eight regulatory genes for endosperm development. Our studies focus on an important agronomic trait in a major crop for global agriculture. The datasets on stage-specific and SHB1-induced gene expression and genes targeted by SHB1 also provide a useful resource in the field of endosperm development and seed size engineering. Our practices in an allotetraploid species will impact similar studies in other crop species.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Seed development in dicots includes early endosperm proliferation followed by growth of the embryo to replace the endosperm. Endosperm proliferation in dicots not only provides nutrient supplies for subsequent embryo development but also enforces a space limitation, influencing final seed size. Overexpression of Arabidopsis SHORT HYPOCOTYL UNDER BLUE1::uidA (SHB1:uidA) in canola produces large seeds. We performed global analysis of the canola genes that were expressed and influenced by SHB1 during early endosperm proliferation at 8 days after pollination (DAP) and late embryo development at 13 DAP. Overexpression of SHB1 altered the expression of 973 genes at 8 DAP and 1035 genes at 13 DAP. We also surveyed the global SHB1 association sites, and merging of these sites with the RNA sequencing data identified a set of canola genes targeted by SHB1. The 8-DAP list includes positive and negative genes that influence endosperm proliferation and are homologous to Arabidopsis MINI3, IKU2, SHB1, AGL62, FIE and AP2. We revealed a major role for SHB1 in canola endosperm development based on the dynamics of SHB1-altered gene expression, the magnitude of SHB1 chromatin immunoprecipitation enrichment and the over-representation of eight regulatory genes for endosperm development. 
Our studies focus on an important agronomic trait in a major crop for global agriculture. The datasets on stage-specific and SHB1-induced gene expression and genes targeted by SHB1 also provide a useful resource in the field of endosperm development and seed size engineering. Our practices in an allotetraploid species will impact similar studies in other crop species. |
Roe, David; Vierra-Green, Cynthia; Pyo, C-W; Eng, K; Hall, R; Kuang, Rui; Spellman, Stephen; Ranade, S; Geraghty, D E; Maiers, Martin Revealing complete complex KIR haplotypes phased by long-read sequencing technology Journal Article In: Genes & Immunity, pp. 1–8, 2017. Abstract | Links | BibTeX | Tags: KIR Haplotype Inference @article{KIR2017, title = {Revealing complete complex KIR haplotypes phased by long-read sequencing technology}, author = {David Roe and Cynthia Vierra-Green and C-W Pyo and K Eng and R Hall and Rui Kuang and Stephen Spellman and S Ranade and D E Geraghty and Martin Maiers}, url = {https://www.nature.com/gene/journal/vaop/ncurrent/full/gene201710a.html}, doi = {10.1038/gene.2017.10}, year = {2017}, date = {2017-06-01}, journal = {Genes \& Immunity}, pages = {1--8}, abstract = {The killer cell immunoglobulin-like receptor (KIR) region of human chromosome 19 contains up to 16 genes for natural killer (NK) cell receptors that recognize human leukocyte antigen (HLA)/peptide complexes and other ligands. The KIR proteins fulfill functional roles in infections, pregnancy, autoimmune diseases and transplantation. However, their characterization remains a constant challenge. Not only are the genes highly homologous due to their recent evolution by tandem duplications, but the region is structurally dynamic due to frequent transposon-mediated recombination. A sequencing approach that precisely captures the complexity of KIR haplotypes for functional annotation is desirable. We present a unique approach to haplotype the KIR loci using single-molecule, real-time (SMRT) sequencing. Using this method, we have—for the first time—comprehensively sequenced and phased sixteen KIR haplotypes from eight individuals without imputation. The information revealed four novel haplotype structures, a novel gene-fusion allele, novel and confirmed insertion/deletion events, a homozygous individual, and overall diversity for the structural haplotypes and their alleles. 
These KIR haplotypes augment our existing knowledge by providing high-quality references, evolutionary informers, and source material for imputation. The haplotype sequences and gene annotations provide alternative loci for the KIR region in the human genome reference GrCh38.p8.}, keywords = {KIR Haplotype Inference}, pubstate = {published}, tppubtype = {article} } The killer cell immunoglobulin-like receptor (KIR) region of human chromosome 19 contains up to 16 genes for natural killer (NK) cell receptors that recognize human leukocyte antigen (HLA)/peptide complexes and other ligands. The KIR proteins fulfill functional roles in infections, pregnancy, autoimmune diseases and transplantation. However, their characterization remains a constant challenge. Not only are the genes highly homologous due to their recent evolution by tandem duplications, but the region is structurally dynamic due to frequent transposon-mediated recombination. A sequencing approach that precisely captures the complexity of KIR haplotypes for functional annotation is desirable. We present a unique approach to haplotype the KIR loci using single-molecule, real-time (SMRT) sequencing. Using this method, we have—for the first time—comprehensively sequenced and phased sixteen KIR haplotypes from eight individuals without imputation. The information revealed four novel haplotype structures, a novel gene-fusion allele, novel and confirmed insertion/deletion events, a homozygous individual, and overall diversity for the structural haplotypes and their alleles. These KIR haplotypes augment our existing knowledge by providing high-quality references, evolutionary informers, and source material for imputation. The haplotype sequences and gene annotations provide alternative loci for the KIR region in the human genome reference GrCh38.p8. |
2016 |
Petegrosso, Raphael; Park, Sunho; Hwang, Tae Hyun; Kuang, Rui Transfer Learning across Ontologies for Phenome-Genome Association Prediction Journal Article In: Bioinformatics, 33 (4), pp. 529-536, 2016. Abstract | Links | BibTeX | Tags: Phenome-genome Association, Semi-supervised Learning, Transfer Learning @article{petegrosso2016transfer, title = {Transfer Learning across Ontologies for Phenome-Genome Association Prediction}, author = {Raphael Petegrosso and Sunho Park and Tae Hyun Hwang and Rui Kuang}, url = {http://bioinformatics.oxfordjournals.org/content/early/2016/10/20/bioinformatics.btw649.abstract}, doi = {10.1093/bioinformatics/btw649}, year = {2016}, date = {2016-11-23}, journal = {Bioinformatics}, volume = {33}, number = {4}, pages = {529-536}, publisher = {Oxford Univ Press}, abstract = {Motivation: To better predict and analyze gene associations with the collection of phenotypes organized in a phenotype ontology, it is crucial to effectively model the hierarchical structure among the phenotypes in the ontology and leverage the sparse known associations with additional training information. In this paper, we first introduce Dual Label Propagation (DLP) to impose consistent associations with the entire phenotype paths in predicting phenotype-gene associations in Human Phenotype Ontology (HPO). DLP is then used as the base model in a transfer learning framework (tlDLP) to incorporate functional annotations in Gene Ontology (GO). By simultaneously reconstructing GO term-gene associations and HPO phenotype-gene associations for all the genes in a protein-protein interaction network, tlDLP benefits from the enriched training associations indirectly through relation with GO terms. 
Results: In the experiments to predict the associations between human genes and phenotypes in HPO based on human protein-protein interaction network, both DLP and tlDLP improved the prediction of gene associations with phenotype paths in HPO in cross-validation and the prediction of the most recent associations added after the snapshot of the training data. Moreover, the transfer learning through GO term-gene associations significantly improved association predictions for the phenotypes with no more specific known associations by a large margin. Examples are also shown to demonstrate how phenotype paths in phenotype ontology and transfer learning with gene ontology can improve the predictions. Availability: Source code is available at http://localhost/~raphaelpetegrosso/wpcb/ontophenome.}, keywords = {Phenome-genome Association, Semi-supervised Learning, Transfer Learning}, pubstate = {published}, tppubtype = {article} } Motivation: To better predict and analyze gene associations with the collection of phenotypes organized in a phenotype ontology, it is crucial to effectively model the hierarchical structure among the phenotypes in the ontology and leverage the sparse known associations with additional training information. In this paper, we first introduce Dual Label Propagation (DLP) to impose consistent associations with the entire phenotype paths in predicting phenotype-gene associations in Human Phenotype Ontology (HPO). DLP is then used as the base model in a transfer learning framework (tlDLP) to incorporate functional annotations in Gene Ontology (GO). By simultaneously reconstructing GO term-gene associations and HPO phenotype-gene associations for all the genes in a protein-protein interaction network, tlDLP benefits from the enriched training associations indirectly through relation with GO terms. 
Results: In the experiments to predict the associations between human genes and phenotypes in HPO based on human protein-protein interaction network, both DLP and tlDLP improved the prediction of gene associations with phenotype paths in HPO in cross-validation and the prediction of the most recent associations added after the snapshot of the training data. Moreover, the transfer learning through GO term-gene associations significantly improved association predictions for the phenotypes with no more specific known associations by a large margin. Examples are also shown to demonstrate how phenotype paths in phenotype ontology and transfer learning with gene ontology can improve the predictions. Availability: Source code is available at http://localhost/~raphaelpetegrosso/wpcb/ontophenome. |
Vierra-Green, Cynthia; Roe, David; Jayaraman, Jyothi; Trowsdale, John; Traherne, James; Kuang, Rui; Spellman, Stephen; Maiers, Martin Estimating KIR Haplotype Frequencies on a Cohort of 10,000 Individuals: A Comprehensive Study on Population Variations, Typing Resolutions, and Reference Haplotypes Journal Article In: PloS one, 11 (10), pp. e0163973, 2016. Abstract | Links | BibTeX | Tags: KIR Haplotype Inference @article{vierra2016estimating, title = {Estimating KIR Haplotype Frequencies on a Cohort of 10,000 Individuals: A Comprehensive Study on Population Variations, Typing Resolutions, and Reference Haplotypes}, author = {Cynthia Vierra-Green and David Roe and Jyothi Jayaraman and John Trowsdale and James Traherne and Rui Kuang and Stephen Spellman and Martin Maiers}, url = {http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0163973}, doi = {10.1371/journal.pone.0163973}, year = {2016}, date = {2016-10-10}, journal = {PloS one}, volume = {11}, number = {10}, pages = {e0163973}, publisher = {Public Library of Science}, abstract = {The killer cell immunoglobulin-like receptors (KIR) mediate human natural killer (NK) cell cytotoxicity via activating or inhibiting signals. Although informative and functional haplotype patterns have been reported, most genotyping has been performed at resolutions that are structurally ambiguous. In order to leverage structural information given low-resolution genotypes, we performed experiments to quantify the effects of population variations, reference haplotypes, and genotyping resolutions on population-level haplotype frequency estimations as well as predictions of individual haplotypes. We genotyped 10,157 unrelated individuals in 5 populations (518 African American[AFA], 258 Asian or Pacific Islander[API], 8,245 European[EUR], 1,073 Hispanic[HIS], and 63 Native American[NAM]) for KIR gene presence/absence (PA), and additionally half of the AFA samples for KIR gene copy number variation (CNV). A custom EM algorithm was used to estimate haplotype frequencies for each population by interpretation in the context of three sets of reference haplotypes. 
The algorithm also assigns each individual the haplotype pairs of maximum likelihood. Generally, our haplotype frequency estimates agree with similar previous publications to within <5% difference for all haplotypes. The exception is that estimates for NAM from the U.S. showed higher frequency association of cB02 with tA01 (+14%) instead of tB01 (-8.5%) compared to a previous study of NAM from south of the U.S. The higher-resolution CNV genotyping on the AFA samples allowed unambiguous haplotype-pair assignments for the majority of individuals, resulting in a 22% higher median typing resolution score (TRS), which measures likelihood of self-match in the context of population-specific haplo- and geno-types. The use of TRS to quantify reduced ambiguity with CNV data clearly revealed the few individuals with ambiguous genotypes as outliers. It is observed that typing resolution and reference haplotype set influence haplotype frequency estimates. For example, PA resolution may be used with reference haplotype sets up to the point where certain haplotypes are gene-content subsets of others. At that point, CNV must be used for all genes.}, keywords = {KIR Haplotype Inference}, pubstate = {published}, tppubtype = {article} } The killer cell immunoglobulin-like receptors (KIR) mediate human natural killer (NK) cell cytotoxicity via activating or inhibiting signals. Although informative and functional haplotype patterns have been reported, most genotyping has been performed at resolutions that are structurally ambiguous. In order to leverage structural information given low-resolution genotypes, we performed experiments to quantify the effects of population variations, reference haplotypes, and genotyping resolutions on population-level haplotype frequency estimations as well as predictions of individual haplotypes. 
We genotyped 10,157 unrelated individuals in 5 populations (518 African American[AFA], 258 Asian or Pacific Islander[API], 8,245 European[EUR], 1,073 Hispanic[HIS], and 63 Native American[NAM]) for KIR gene presence/absence (PA), and additionally half of the AFA samples for KIR gene copy number variation (CNV). A custom EM algorithm was used to estimate haplotype frequencies for each population by interpretation in the context of three sets of reference haplotypes. The algorithm also assigns each individual the haplotype pairs of maximum likelihood. Generally, our haplotype frequency estimates agree with similar previous publications to within <5% difference for all haplotypes. The exception is that estimates for NAM from the U.S. showed higher frequency association of cB02 with tA01 (+14%) instead of tB01 (-8.5%) compared to a previous study of NAM from south of the U.S. The higher-resolution CNV genotyping on the AFA samples allowed unambiguous haplotype-pair assignments for the majority of individuals, resulting in a 22% higher median typing resolution score (TRS), which measures likelihood of self-match in the context of population-specific haplo- and geno-types. The use of TRS to quantify reduced ambiguity with CNV data clearly revealed the few individuals with ambiguous genotypes as outliers. It is observed that typing resolution and reference haplotype set influence haplotype frequency estimates. For example, PA resolution may be used with reference haplotype sets up to the point where certain haplotypes are gene-content subsets of others. At that point, CNV must be used for all genes. |
Liang, Lining; Sun, Hao; Zhang, Wei; Zhang, Mengdan; Yang, Xiao; Kuang, Rui; Zheng, Hui Meta-Analysis of EMT Datasets Reveals Different Types of EMT Journal Article In: PloS one, 11 (6), pp. e0156839, 2016. Abstract | Links | BibTeX | Tags: Gene Expression, Transcriptome @article{liang2015meta, title = {Meta-Analysis of EMT Datasets Reveals Different Types of EMT}, author = {Lining Liang and Hao Sun and Wei Zhang and Mengdan Zhang and Xiao Yang and Rui Kuang and Hui Zheng}, url = {http://journals.plos.org/plosone/article?id=10.1371%2Fjournal.pone.0156839}, doi = {10.1371/journal.pone.0156839}, year = {2016}, date = {2016-06-03}, journal = {PloS one}, volume = {11}, number = {6}, pages = {e0156839}, abstract = {As a critical process during embryonic development, cancer progression and cell fate conversions, epithelial-mesenchymal transition (EMT) has been extensively studied over the last several decades. To further understand the nature of EMT, we performed meta-analysis of multiple microarray datasets to identify the related generic signature. In this study, 24 human and 17 mouse microarray datasets were integrated to identify conserved gene expression changes in different types of EMT. Our integrative analysis revealed that there is low agreement among the list of the identified signature genes and three other lists in previous studies. Since removing the datasets with weakly-induced EMT from the analysis did not significantly improve the overlapping in the signature-gene lists, we hypothesized the existence of different types of EMT. This hypothesis was further supported by the grouping of 74 human EMT-induction samples into five distinct clusters, and the identification of distinct pathways in these different clusters of EMT samples. The five clusters of EMT-induction samples also improves the understanding of the characteristics of different EMT types. 
Therefore, we concluded the existence of different types of EMT was the possible reason for its complex role in multiple biological processes.}, keywords = {Gene Expression, Transcriptome}, pubstate = {published}, tppubtype = {article} } As a critical process during embryonic development, cancer progression and cell fate conversions, epithelial-mesenchymal transition (EMT) has been extensively studied over the last several decades. To further understand the nature of EMT, we performed meta-analysis of multiple microarray datasets to identify the related generic signature. In this study, 24 human and 17 mouse microarray datasets were integrated to identify conserved gene expression changes in different types of EMT. Our integrative analysis revealed that there is low agreement among the list of the identified signature genes and three other lists in previous studies. Since removing the datasets with weakly-induced EMT from the analysis did not significantly improve the overlapping in the signature-gene lists, we hypothesized the existence of different types of EMT. This hypothesis was further supported by the grouping of 74 human EMT-induction samples into five distinct clusters, and the identification of distinct pathways in these different clusters of EMT samples. The five clusters of EMT-induction samples also improves the understanding of the characteristics of different EMT types. Therefore, we concluded the existence of different types of EMT was the possible reason for its complex role in multiple biological processes. |
2015 |
Zhang, Wei; Chang, Jae-Woong; Lin, Lilong; Minn, Kay; Wu, Baolin; Chien, Jeremy; Yong, Jeongsik; Zheng, Hui; Kuang, Rui Network-based Isoform Quantification with RNA-Seq Data for Cancer Transcriptome Analysis Journal Article In: PLoS Computational Biology, e1004465, 2015. Abstract | Links | BibTeX | Tags: Cancer Genomics, Isoform Quantification, Network-based Learning, RNA-Seq @article{Net-RSTQ, title = {Network-based Isoform Quantification with RNA-Seq Data for Cancer Transcriptome Analysis}, author = {Wei Zhang and Jae-Woong Chang and Lilong Lin and Kay Minn and Baolin Wu and Jeremy Chien and Jeongsik Yong and Hui Zheng and Rui Kuang}, url = {http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1004465}, doi = {10.1371/journal.pcbi.1004465}, year = {2015}, date = {2015-12-23}, journal = {PLoS Computational Biology}, volume = {e1004465}, abstract = {New sequencing technologies for transcriptome-wide profiling of RNAs have greatly promoted the interest in isoform-based functional characterizations of a cellular system. Elucidation of gene expressions at the isoform resolution could lead to new molecular mechanisms such as gene-regulations and alternative splicings, and potentially better molecular signals for phenotype predictions. However, it could be overly optimistic to derive the proportion of the isoforms of a gene solely based on short read alignments. Inherently, systematical sampling biases from RNA library preparation and ambiguity of read origins in overlapping isoforms pose a problem in reliability. The work in this paper examines the possibility of using protein domain-domain interactions as prior knowledge in isoform transcript quantification. 
We first made the observation that protein domain-domain interactions positively correlate with isoform co-expressions in TCGA data and then designed a probabilistic EM approach to integrate domain-domain interactions with short read alignments for estimation of isoform proportions. Validated by qRT-PCR experiments on three cell lines, simulations and classifications of TCGA patient samples in several cancer types, Net-RSTQ is proven a useful tool for isoform-based analysis in functional genomes and systems biology.}, keywords = {Cancer Genomics, Isoform Quantification, Network-based Learning, RNA-Seq}, pubstate = {published}, tppubtype = {article} } New sequencing technologies for transcriptome-wide profiling of RNAs have greatly promoted the interest in isoform-based functional characterizations of a cellular system. Elucidation of gene expressions at the isoform resolution could lead to new molecular mechanisms such as gene-regulations and alternative splicings, and potentially better molecular signals for phenotype predictions. However, it could be overly optimistic to derive the proportion of the isoforms of a gene solely based on short read alignments. Inherently, systematical sampling biases from RNA library preparation and ambiguity of read origins in overlapping isoforms pose a problem in reliability. The work in this paper examines the possibility of using protein domain-domain interactions as prior knowledge in isoform transcript quantification. We first made the observation that protein domain-domain interactions positively correlate with isoform co-expressions in TCGA data and then designed a probabilistic EM approach to integrate domain-domain interactions with short read alignments for estimation of isoform proportions. Validated by qRT-PCR experiments on three cell lines, simulations and classifications of TCGA patient samples in several cancer types, Net-RSTQ is proven a useful tool for isoform-based analysis in functional genomes and systems biology. 
|
Chang, Jae-Woong; Zhang, Wei; Yeh, Hsin-Sung; de Jong, Ebbing P; Jun, Semo; Kim, Kwan-Hyun; Bae, Sun S; Beckman, Kenneth; Hwang, Tae Hyun; Kim, Kye-Seong; others, mRNA 3'-UTR shortening is a molecular signature of mTORC1 activation Journal Article In: Nature communications, 6 , 2015. Abstract | Links | BibTeX | Tags: @article{chang2015mrna, title = {mRNA 3'-UTR shortening is a molecular signature of mTORC1 activation}, author = {Jae-Woong Chang and Wei Zhang and Hsin-Sung Yeh and Ebbing P de Jong and Semo Jun and Kwan-Hyun Kim and Sun S Bae and Kenneth Beckman and Tae Hyun Hwang and Kye-Seong Kim and others}, url = {http://www.nature.com/articles/ncomms8218}, doi = {10.1038/ncomms8218}, year = {2015}, date = {2015-06-15}, journal = {Nature communications}, volume = {6}, publisher = {Nature Publishing Group}, abstract = {Mammalian target of rapamycin (mTOR) enhances translation from a subset of messenger RNAs containing distinct 5′-untranslated region (UTR) sequence features. Here we identify 3′-UTR shortening of mRNAs as an additional molecular signature of mTOR activation and show that 3′-UTR shortening enhances the translation of specific mRNAs. Using genetic or chemical modulations of mTOR activity in cells or mouse tissues, we show that cellular mTOR activity is crucial for 3′-UTR shortening. Although long 3′-UTR-containing transcripts minimally contribute to translation, 3′-UTR-shortened transcripts efficiently form polysomes in the mTOR-activated cells, leading to increased protein production. Strikingly, selected E2 and E3 components of ubiquitin ligase complexes are enriched by this mechanism, resulting in elevated levels of protein ubiquitination on mTOR activation. 
Together, these findings identify a previously uncharacterized role for mTOR in the selective regulation of protein synthesis by modulating 3′-UTR length of mRNAs.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Mammalian target of rapamycin (mTOR) enhances translation from a subset of messenger RNAs containing distinct 5′-untranslated region (UTR) sequence features. Here we identify 3′-UTR shortening of mRNAs as an additional molecular signature of mTOR activation and show that 3′-UTR shortening enhances the translation of specific mRNAs. Using genetic or chemical modulations of mTOR activity in cells or mouse tissues, we show that cellular mTOR activity is crucial for 3′-UTR shortening. Although long 3′-UTR-containing transcripts minimally contribute to translation, 3′-UTR-shortened transcripts efficiently form polysomes in the mTOR-activated cells, leading to increased protein production. Strikingly, selected E2 and E3 components of ubiquitin ligase complexes are enriched by this mechanism, resulting in elevated levels of protein ubiquitination on mTOR activation. Together, these findings identify a previously uncharacterized role for mTOR in the selective regulation of protein synthesis by modulating 3′-UTR length of mRNAs. |
Xie, MaoQiang; Xu, YingJie; Zhang, YaoGong; Hwang, TaeHyun; Kuang, Rui Network-based Phenome-Genome Association Prediction by Bi-Random Walk Journal Article In: PloS one, 10 (5), pp. e0125138, 2015. Abstract | Links | BibTeX | Tags: Phenome-genome Association, Semi-supervised Learning @article{xie2015network, title = {Network-based Phenome-Genome Association Prediction by Bi-Random Walk}, author = {MaoQiang Xie and YingJie Xu and YaoGong Zhang and TaeHyun Hwang and Rui Kuang}, url = {http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0125138}, doi = {10.1371/journal.pone.0125138}, year = {2015}, date = {2015-05-01}, journal = {PloS one}, volume = {10}, number = {5}, pages = {e0125138}, publisher = {Public Library of Science}, abstract = {The availability of ontologies and systematic documentations of phenotypes and their genetic associations has enabled large-scale network-based global analyses of the association between the complete collection of phenotypes (phenome) and genes. To provide a fundamental understanding of how the network information is relevant to phenotype-gene associations, we analyze the circular bigraphs (CBGs) in OMIM human disease phenotype-gene association network and MGI mouse phenotype-gene association network, and introduce a bi-random walk (BiRW) algorithm to capture the CBG patterns in the networks for unveiling human and mouse phenome-genome association. 
BiRW performs separate random walk simultaneously on gene interaction network and phenotype similarity network to explore gene paths and phenotype paths in CBGs of different sizes to summarize their associations as predictions.}, keywords = {Phenome-genome Association, Semi-supervised Learning}, pubstate = {published}, tppubtype = {article} } The availability of ontologies and systematic documentations of phenotypes and their genetic associations has enabled large-scale network-based global analyses of the association between the complete collection of phenotypes (phenome) and genes. To provide a fundamental understanding of how the network information is relevant to phenotype-gene associations, we analyze the circular bigraphs (CBGs) in OMIM human disease phenotype-gene association network and MGI mouse phenotype-gene association network, and introduce a bi-random walk (BiRW) algorithm to capture the CBG patterns in the networks for unveiling human and mouse phenome-genome association. BiRW performs separate random walk simultaneously on gene interaction network and phenotype similarity network to explore gene paths and phenotype paths in CBGs of different sizes to summarize their associations as predictions. |
Chien, Jeremy; Sicotte, Hugues; Fan, Jian-Bing; Humphray, Sean; Cunningham, Julie M; Kalli, Kimberly R; Oberg, Ann L; Hart, Steven N; Li, Ying; Davila, Jaime I; others, TP53 mutations, tetraploidy and homologous recombination repair defects in early stage high-grade serous ovarian cancer Journal Article In: Nucleic acids research, pp. gkv111, 2015. Abstract | Links | BibTeX | Tags: Cancer Genomics @article{chien2015tp53, title = {TP53 mutations, tetraploidy and homologous recombination repair defects in early stage high-grade serous ovarian cancer}, author = {Jeremy Chien and Hugues Sicotte and Jian-Bing Fan and Sean Humphray and Julie M Cunningham and Kimberly R Kalli and Ann L Oberg and Steven N Hart and Ying Li and Jaime I Davila and others}, url = {http://nar.oxfordjournals.org/content/43/14/6945}, doi = {10.1093/nar/gkv111}, year = {2015}, date = {2015-02-02}, journal = {Nucleic acids research}, pages = {gkv111}, publisher = {Oxford Univ Press}, abstract = {To determine early somatic changes in high-grade serous ovarian cancer (HGSOC), we performed whole genome sequencing on a rare collection of 16 low stage HGSOCs. The majority showed extensive structural alterations (one had an ultramutated profile), exhibited high levels of p53 immunoreactivity, and harboured TP53 mutation, deletion or inactivation. BRCA1 and BRCA2 mutations were observed in two tumors, with nine showing evidence of a homologous recombination (HR) defect. Combined analysis with The Cancer Genome Atlas indicated that low and late stage HGSOCs have similar mutation and copy number profiles. We also found evidence that deleterious TP53 mutations are the earliest events, followed by deletions or loss of heterozygosity (LOH) of chromosomes carrying TP53, BRCA1 or BRCA2. Inactivation of HR appears to be an early event, as 62.5% of tumours showed a LOH pattern suggestive of HR defects. 
Three tumours with the highest ploidy had little genome-wide LOH, yet one of these had a homozygous somatic frame-shift BRCA2 mutation, suggesting that some carcinomas begin as tetraploid then descend into diploidy accompanied by genome-wide LOH. Lastly, we found evidence that structural variants (SV) cluster in HGSOC, but are absent in one ultramutated tumor, providing insights into the pathogenesis of low stage HGSOC.}, keywords = {Cancer Genomics}, pubstate = {published}, tppubtype = {article} } To determine early somatic changes in high-grade serous ovarian cancer (HGSOC), we performed whole genome sequencing on a rare collection of 16 low stage HGSOCs. The majority showed extensive structural alterations (one had an ultramutated profile), exhibited high levels of p53 immunoreactivity, and harboured TP53 mutation, deletion or inactivation. BRCA1 and BRCA2 mutations were observed in two tumors, with nine showing evidence of a homologous recombination (HR) defect. Combined analysis with The Cancer Genome Atlas indicated that low and late stage HGSOCs have similar mutation and copy number profiles. We also found evidence that deleterious TP53 mutations are the earliest events, followed by deletions or loss of heterozygosity (LOH) of chromosomes carrying TP53, BRCA1 or BRCA2. Inactivation of HR appears to be an early event, as 62.5% of tumours showed a LOH pattern suggestive of HR defects. Three tumours with the highest ploidy had little genome-wide LOH, yet one of these had a homozygous somatic frame-shift BRCA2 mutation, suggesting that some carcinomas begin as tetraploid then descend into diploidy accompanied by genome-wide LOH. Lastly, we found evidence that structural variants (SV) cluster in HGSOC, but are absent in one ultramutated tumor, providing insights into the pathogenesis of low stage HGSOC. |
Johnson, Nicholas; Zhang, Huanan; Fang, Gang; Kumar, Vipin; Kuang, Rui SubPatCNV: approximate subspace pattern mining for mapping copy-number variations Journal Article In: BMC bioinformatics, 16 (1), pp. 1, 2015, ISSN: 1471-2105. Abstract | Links | BibTeX | Tags: Cancer Genomics, DNA Copy Number Variation @article{johnson2015subpatcnv, title = {SubPatCNV: approximate subspace pattern mining for mapping copy-number variations}, author = {Nicholas Johnson and Huanan Zhang and Gang Fang and Vipin Kumar and Rui Kuang}, url = {https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-014-0426-7}, doi = {10.1186/s12859-014-0426-7}, issn = {1471-2105}, year = {2015}, date = {2015-01-16}, journal = {BMC bioinformatics}, volume = {16}, number = {1}, pages = {1}, publisher = {BioMed Central}, abstract = {Background Many DNA copy-number variations (CNVs) are known to lead to phenotypic variations and pathogenesis. While CNVs are often only common in a small number of samples in the studied population or patient cohort, previous work has not focused on customized identification of CNV regions that only exhibit in subsets of samples with advanced data mining techniques to reliably answer questions such as “Which are all the chromosomal fragments showing nearly identical deletions or insertions in more than 30% of the individuals?”. Results We introduce a tool for mining CNV subspace patterns, namely SubPatCNV, which is capable of identifying all aberrant CNV regions specific to arbitrary sample subsets larger than a support threshold. By design, SubPatCNV is the implementation of a variation of approximate association pattern mining algorithm under a spatial constraint on the positional CNV probe features. In benchmark test, SubPatCNV was applied to identify population specific germline CNVs from four populations of HapMap samples. 
In experiments on the TCGA ovarian cancer dataset, SubPatCNV discovered many large aberrant CNV events in patient subgroups, and reported regions enriched with cancer relevant genes. In both HapMap data and TCGA data, it was observed that SubPatCNV employs approximate pattern mining to more effectively identify CNV subspace patterns that are consistent within a subgroup from high-density array data. Conclusions SubPatCNV available through http://sourceforge.net/projects/subpatcnv/ is a unique scalable open-source software tool that provides the flexibility of identifying CNV regions specific to sample subgroups of different sizes from high-density CNV array data.}, keywords = {Cancer Genomics, DNA Copy Number Variation}, pubstate = {published}, tppubtype = {article} } Background Many DNA copy-number variations (CNVs) are known to lead to phenotypic variations and pathogenesis. While CNVs are often only common in a small number of samples in the studied population or patient cohort, previous work has not focused on customized identification of CNV regions that only exhibit in subsets of samples with advanced data mining techniques to reliably answer questions such as “Which are all the chromosomal fragments showing nearly identical deletions or insertions in more than 30% of the individuals?”. Results We introduce a tool for mining CNV subspace patterns, namely SubPatCNV, which is capable of identifying all aberrant CNV regions specific to arbitrary sample subsets larger than a support threshold. By design, SubPatCNV is the implementation of a variation of approximate association pattern mining algorithm under a spatial constraint on the positional CNV probe features. In benchmark test, SubPatCNV was applied to identify population specific germline CNVs from four populations of HapMap samples. 
In experiments on the TCGA ovarian cancer dataset, SubPatCNV discovered many large aberrant CNV events in patient subgroups, and reported regions enriched with cancer relevant genes. In both HapMap data and TCGA data, it was observed that SubPatCNV employs approximate pattern mining to more effectively identify CNV subspace patterns that are consistent within a subgroup from high-density array data. Conclusions SubPatCNV available through http://sourceforge.net/projects/subpatcnv/ is a unique scalable open-source software tool that provides the flexibility of identifying CNV regions specific to sample subgroups of different sizes from high-density CNV array data. |
Cai, Hong; Lilburn, Timothy G; Hong, Changjin; Gu, Jianying; Kuang, Rui; Wang, Yufeng Predicting and exploring network components involved in pathogenesis in the malaria parasite via novel subnetwork alignments Journal Article In: BMC systems biology, 9 (4), pp. 1, 2015. BibTeX | Tags: Network Alignment, Protein-Protein Interaction Network @article{cai2015predicting, title = {Predicting and exploring network components involved in pathogenesis in the malaria parasite via novel subnetwork alignments}, author = {Cai, Hong and Lilburn, Timothy G and Hong, Changjin and Gu, Jianying and Kuang, Rui and Wang, Yufeng}, year = {2015}, date = {2015-01-01}, journal = {BMC systems biology}, volume = {9}, number = {4}, pages = {1}, publisher = {BioMed Central}, keywords = {Network Alignment, Protein-Protein Interaction Network}, pubstate = {published}, tppubtype = {article} } |
Sharma, Ankit; Kuang, Rui; Srivastava, Jaideep; Feng, Xiaodong; Singhal, Kartik Predicting small group accretion in social networks: A topology based incremental approach Inproceedings In: 2015 IEEE/ACM International Conference on Advances in Social Networks Analysis and Mining (ASONAM), pp. 408–415, IEEE 2015, ISBN: 978-1-4503-3854-7. Abstract | Links | BibTeX | Tags: Semi-supervised Learning @inproceedings{sharma2015predicting, title = {Predicting small group accretion in social networks: A topology based incremental approach}, author = {Ankit Sharma and Rui Kuang and Jaideep Srivastava and Xiaodong Feng and Kartik Singhal}, url = {http://delivery.acm.org/10.1145/2810000/2808914/p408_sharma.pdf}, doi = {10.1145/2808797.2808914}, isbn = {978-1-4503-3854-7}, year = {2015}, date = {2015-01-01}, booktitle = {2015 IEEE/ACM International Conference on Advances in Social Networks Analysis and Mining (ASONAM)}, pages = {408--415}, organization = {IEEE}, abstract = {Small Group evolution has been of central importance in social sciences and also in the industry for understanding dynamics of team formation. While most of research works studying groups deal at a macro level with evolution of arbitrary size communities, in this paper we restrict ourselves to studying evolution of small group (size ≤ 20) which is governed by contrasting sociological phenomenon. Given a previous history of group collaboration between a set of actors, we address the problem of predicting likely future group collaborations. Unfortunately, predicting groups requires choosing from (n choose r) possibilities (where r is group size and n is total number of actors), which becomes computationally intractable as group size increases. 
However, our statistical analysis of a real world dataset has shown that two processes: an external actor joining an existing group (incremental accretion (IA)) or collaborating with a subset of actors of an existing group (subgroup accretion (SA)), are largely responsible for future group formation. This helps to drastically reduce the (n choose r) possibilities. We therefore, model the attachment of a group for different actors outside this group. In this paper, we have built three topology based prediction models to study these phenomena. The performance of these models is evaluated using extensive experiments over DBLP dataset. Our prediction results show that the proposed models are significantly useful for future group predictions both for IA and SA.}, keywords = {Semi-supervised Learning}, pubstate = {published}, tppubtype = {inproceedings} } Small Group evolution has been of central importance in social sciences and also in the industry for understanding dynamics of team formation. While most of research works studying groups deal at a macro level with evolution of arbitrary size communities, in this paper we restrict ourselves to studying evolution of small group (size ≤ 20) which is governed by contrasting sociological phenomenon. Given a previous history of group collaboration between a set of actors, we address the problem of predicting likely future group collaborations. Unfortunately, predicting groups requires choosing from (n choose r) possibilities (where r is group size and n is total number of actors), which becomes computationally intractable as group size increases. However, our statistical analysis of a real world dataset has shown that two processes: an external actor joining an existing group (incremental accretion (IA)) or collaborating with a subset of actors of an existing group (subgroup accretion (SA)), are largely responsible for future group formation. This helps to drastically reduce the (n choose r) possibilities. 
We therefore, model the attachment of a group for different actors outside this group. In this paper, we have built three topology based prediction models to study these phenomena. The performance of these models is evaluated using extensive experiments over DBLP dataset. Our prediction results show that the proposed models are significantly useful for future group predictions both for IA and SA. |
2013 |
Zhang, Huanan; Tian, Ze; Kuang, Rui Transfer learning across cancers on DNA copy number variation analysis Inproceedings In: 2013 IEEE 13th International Conference on Data Mining, pp. 1283–1288, IEEE IEEE, 2013, ISBN: 978-0-7695-5108-1. Abstract | Links | BibTeX | Tags: Cancer Genomics, DNA Copy Number Variation, Transfer Learning @inproceedings{zhang2013transfer, title = {Transfer learning across cancers on DNA copy number variation analysis}, author = {Huanan Zhang and Ze Tian and Rui Kuang}, url = {http://compbio.cs.umn.edu/wp-content/uploads/2017/10/TLFL-10Page.pdf}, doi = {10.1109/ICDM.2013.58}, isbn = {978-0-7695-5108-1}, year = {2013}, date = {2013-12-07}, booktitle = {2013 IEEE 13th International Conference on Data Mining}, pages = {1283--1288}, publisher = {IEEE}, organization = {IEEE}, abstract = {Abstract: DNA copy number variations (CNVs) are prevalent in all types of tumors. It is still a challenge to study how CNVs play a role in driving tumorgenic mechanisms that are either universal or specific in different cancer types. To address the problem, we introduce a transfer learning framework to discover common CNVs shared across different tumor types as well as CNVs specific to each tumor type from genome-wide CNV data measured by array CGH and SNP genotyping array. The proposed model, namely Transfer Learning with Fused LASSO (TLFL), detects latent CNV components from multiple CNV datasets of different tumor types to distinguish the CNVs that are common across the datasets and those that are specific in each dataset. Both the common and type-specific CNVs are detected as latent components in matrix factorization coupled with fused LASSO on adjacent CNV probe features. TLFL considers the common latent components underlying the multiple datasets to transfer knowledge across different tumor types. 
In simulations and experiments on real cancer CNV datasets, TLFL detected better latent components that can be used as features to improve classification of patient samples in each individual dataset compared with the model without the knowledge transfer. In cross-dataset analysis on bladder cancer and cross-domain analysis on breast cancer and ovarian cancer, TLFL also learned latent CNV components that are both predictive of tumor stages and correlate with known cancer genes.}, keywords = {Cancer Genomics, DNA Copy Number Variation, Transfer Learning}, pubstate = {published}, tppubtype = {inproceedings} } Abstract: DNA copy number variations (CNVs) are prevalent in all types of tumors. It is still a challenge to study how CNVs play a role in driving tumorgenic mechanisms that are either universal or specific in different cancer types. To address the problem, we introduce a transfer learning framework to discover common CNVs shared across different tumor types as well as CNVs specific to each tumor type from genome-wide CNV data measured by array CGH and SNP genotyping array. The proposed model, namely Transfer Learning with Fused LASSO (TLFL), detects latent CNV components from multiple CNV datasets of different tumor types to distinguish the CNVs that are common across the datasets and those that are specific in each dataset. Both the common and type-specific CNVs are detected as latent components in matrix factorization coupled with fused LASSO on adjacent CNV probe features. TLFL considers the common latent components underlying the multiple datasets to transfer knowledge across different tumor types. In simulations and experiments on real cancer CNV datasets, TLFL detected better latent components that can be used as features to improve classification of patient samples in each individual dataset compared with the model without the knowledge transfer. 
In cross-dataset analysis on bladder cancer and cross-domain analysis on breast cancer and ovarian cancer, TLFL also learned latent CNV components that are both predictive of tumor stages and correlate with known cancer genes. |
Cai, Hong; Hong, Changjin; Lilburn, Timothy G; Rodriguez, Armando L; Chen, Sheng; Gu, Jianying; Kuang, Rui; Wang, Yufeng A novel subnetwork alignment approach predicts new components of the cell cycle regulatory apparatus in Plasmodium falciparum Journal Article In: BMC bioinformatics, 14 (Suppl 12), pp. S2, 2013, ISSN: 1471-2105. Abstract | Links | BibTeX | Tags: Network Alignment @article{cai2013novel, title = {A novel subnetwork alignment approach predicts new components of the cell cycle regulatory apparatus in Plasmodium falciparum}, author = {Hong Cai and Changjin Hong and Timothy G Lilburn and Armando L Rodriguez and Sheng Chen and Jianying Gu and Rui Kuang and Yufeng Wang}, url = {http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-14-S12-S2}, doi = {10.1186/1471-2105-14-S12-S2}, issn = {1471-2105}, year = {2013}, date = {2013-09-24}, journal = {BMC bioinformatics}, volume = {14}, number = {Suppl 12}, pages = {S2}, publisher = {BioMed Central}, abstract = {Background According to the World Health Organization, half the world's population is at risk of contracting malaria. They estimated that in 2010 there were 219 million cases of malaria, resulting in 660,000 deaths and an enormous economic burden on the countries where malaria is endemic. The adoption of various high-throughput genomics-based techniques by malaria researchers has meant that new avenues to the study of this disease are being explored and new targets for controlling the disease are being developed. Here, we apply a novel neighborhood subnetwork alignment approach to identify the interacting elements that help regulate the cell cycle of the malaria parasite Plasmodium falciparum. Results Our novel subnetwork alignment approach was used to compare networks in Escherichia coli and P. falciparum. Some 574 P. falciparum proteins were revealed as functional orthologs of known cell cycle proteins in E. coli. 
Over one third of these predicted functional orthologs were annotated as "conserved Plasmodium proteins" or "putative uncharacterized proteins" of unknown function. The predicted functionalities included cyclins, kinases, surface antigens, transcriptional regulators and various functions related to DNA replication, repair and cell division. Conclusions The results of our analysis demonstrate the power of our subnetwork alignment approach to assign functionality to previously unannotated proteins. Here, the focus was on proteins involved in cell cycle regulation. These proteins are involved in the control of diverse aspects of the parasite lifecycle and of important aspects of pathogenesis.}, keywords = {Network Alignment}, pubstate = {published}, tppubtype = {article} } Background According to the World Health Organization, half the world's population is at risk of contracting malaria. They estimated that in 2010 there were 219 million cases of malaria, resulting in 660,000 deaths and an enormous economic burden on the countries where malaria is endemic. The adoption of various high-throughput genomics-based techniques by malaria researchers has meant that new avenues to the study of this disease are being explored and new targets for controlling the disease are being developed. Here, we apply a novel neighborhood subnetwork alignment approach to identify the interacting elements that help regulate the cell cycle of the malaria parasite Plasmodium falciparum. Results Our novel subnetwork alignment approach was used to compare networks in Escherichia coli and P. falciparum. Some 574 P. falciparum proteins were revealed as functional orthologs of known cell cycle proteins in E. coli. Over one third of these predicted functional orthologs were annotated as "conserved Plasmodium proteins" or "putative uncharacterized proteins" of unknown function. 
The predicted functionalities included cyclins, kinases, surface antigens, transcriptional regulators and various functions related to DNA replication, repair and cell division. Conclusions The results of our analysis demonstrate the power of our subnetwork alignment approach to assign functionality to previously unannotated proteins. Here, the focus was on proteins involved in cell cycle regulation. These proteins are involved in the control of diverse aspects of the parasite lifecycle and of important aspects of pathogenesis. |
Chien, Jeremy; Kuang, Rui; Landen, Charles; Shridhar, Viji Platinum-sensitive recurrence in ovarian cancer: the role of tumor microenvironment Journal Article In: Frontiers in oncology, 3 , pp. 251, 2013. Abstract | Links | BibTeX | Tags: Cancer Genomics @article{chien2013platinumb, title = {Platinum-sensitive recurrence in ovarian cancer: the role of tumor microenvironment}, author = {Jeremy Chien and Rui Kuang and Charles Landen and Viji Shridhar}, url = {http://journal.frontiersin.org/article/10.3389/fonc.2013.00251/full}, doi = {10.3389/fonc.2013.00251}, year = {2013}, date = {2013-09-23}, journal = {Frontiers in oncology}, volume = {3}, pages = {251}, publisher = {Frontiers}, abstract = {Despite several advances in the understanding of ovarian cancer pathobiology, in terms of driver genetic alterations in high-grade serous cancer, histologic heterogeneity of epithelial ovarian cancer, cell-of-origin for ovarian cancer, the survival rate from ovarian cancer is disappointingly low when compared to that of breast or prostate cancer. One of the factors contributing to the poor survival rate from ovarian cancer is the development of chemotherapy resistance following several rounds of chemotherapy. Although unicellular drug resistance mechanisms contribute to chemotherapy resistance, tumor microenvironment and the extracellular matrix (ECM), in particular, is emerging as a significant determinant of a tumor’s response to chemotherapy. In this review, we discuss the potential role of the tumor microenvironment in ovarian cancer recurrence and resistance to chemotherapy. 
Finally, we propose an alternative view of platinum-sensitive recurrence to describe a potential role of the ECM in the process.}, keywords = {Cancer Genomics}, pubstate = {published}, tppubtype = {article} } Despite several advances in the understanding of ovarian cancer pathobiology, in terms of driver genetic alterations in high-grade serous cancer, histologic heterogeneity of epithelial ovarian cancer, cell-of-origin for ovarian cancer, the survival rate from ovarian cancer is disappointingly low when compared to that of breast or prostate cancer. One of the factors contributing to the poor survival rate from ovarian cancer is the development of chemotherapy resistance following several rounds of chemotherapy. Although unicellular drug resistance mechanisms contribute to chemotherapy resistance, tumor microenvironment and the extracellular matrix (ECM), in particular, is emerging as a significant determinant of a tumor’s response to chemotherapy. In this review, we discuss the potential role of the tumor microenvironment in ovarian cancer recurrence and resistance to chemotherapy. Finally, we propose an alternative view of platinum-sensitive recurrence to describe a potential role of the ECM in the process. |
Publications
2020 |
Platform-integrated mRNA Isoform Quantification Journal Article In: Bioinformatics, 36 (8), pp. 2466–2473, 2020. |
Hierarchical Canonical Correlation Analysis Reveals Phenotype, Genotype, and Geoclimate Associations in Plants Journal Article In: Plant Phenomics, 2020 (1969142), 2020. |
A large-scale comparative study of isoform expressions measured on four platforms Journal Article In: BMC Genomics, 21 (272), 2020. |
2019 |
Learning a Low-rank Tensor of Pharmacogenomic Multi-relations from Biomedical Networks Proceeding IEEE International Conference on Data Mining 2019. |
Machine Learning and Statistical Methods for Clustering Single-cell RNA-sequencing Data Journal Article In: Briefings in Bioinformatics, 2019. |
In: Transactions in GIS, 23 (3), pp. 558–578, 2019. |
Scalable Remote Homology Detection and Fold Recognition in Massive Protein Networks Journal Article In: PROTEINS: Structure, Function, and Bioinformatics, 87 (6), pp. 478-491, 2019. |
2018 |
Obesity-Induced Protein Carbonylation In Murine Adipose Tissue Regulates The DNA Binding Domain Of Nuclear Zinc-Finger Proteins Journal Article Forthcoming In: Journal of Biological Chemistry, Forthcoming. |
An integrative model for alternative polyadenylation, IntMAP, delineates mTOR-modulated endoplasmic reticulum stress response Journal Article In: Nucleic Acids Research, 46 (12), pp. 5996–6008, 2018. |
HRB2 and BBX21 interaction modulates Arabidopsis ABI5 locus and stomatal aperture Journal Article In: Plant, Cell & Environment, (41), pp. 1912-1925, 2018. |
A Multitask Clustering Approach for Single-cell RNA-Seq Analysis in Recessive Dystrophic Epidermolysis Bullosa Journal Article In: PLOS Computational Biology, 14 (4), 2018. |
Vasodilator-stimulated phosphoprotein promotes liver metastasis of gastrointestinal cancer by activating a β1-integrin-FAK-YAP1/TAZ signaling pathway Journal Article Forthcoming In: npj Precision Oncology, Forthcoming. |
Scalable Label Propagation for Multi-relational Learning on Tensor Product Graph Conference arXiv, 2018. |
2017 |
Detecting Population-differentiation Copy Number Variants in Human Population Tree by Sparse Group Selection Journal Article Forthcoming In: IEEE/ACM Transactions on Computational Biology and Bioinformatics, 16 (2), pp. 538 - 549, Forthcoming. |
Network-based Machine Learning and Graph Theory Algorithms for Precision Oncology Journal Article In: npj Precision Oncology, (25), 2017. |
Global analysis of canola genes targeted by SHORT HYPOCOTYL UNDER BLUE 1 during endosperm and embryo development Journal Article In: The Plant Journal, 91 (1), pp. 158-171, 2017. |
Revealing complete complex KIR haplotypes phased by long-read sequencing technology Journal Article In: Genes & Immunity, pp. 1–8, 2017. |
2016 |
Transfer Learning across Ontologies for Phenome-Genome Association Prediction Journal Article In: Bioinformatics, 33 (4), pp. 529-536, 2016. |
In: PloS one, 11 (10), pp. e0163973, 2016. |
Meta-Analysis of EMT Datasets Reveals Different Types of EMT Journal Article In: PloS one, 11 (6), pp. e0156839, 2016. |
2015 |
Network-based Isoform Quantification with RNA-Seq Data for Cancer Transcriptome Analysis Journal Article In: PLoS Computational Biology, e1004465 , 2015. |
mRNA 3'-UTR shortening is a molecular signature of mTORC1 activation Journal Article In: Nature communications, 6 , 2015. |
Network-based Phenome-Genome Association Prediction by Bi-Random Walk Journal Article In: PloS one, 10 (5), pp. e0125138, 2015. |
TP53 mutations, tetraploidy and homologous recombination repair defects in early stage high-grade serous ovarian cancer Journal Article In: Nucleic acids research, pp. gkv111, 2015. |
SubPatCNV: approximate subspace pattern mining for mapping copy-number variations Journal Article In: BMC bioinformatics, 16 (1), pp. 1, 2015, ISSN: 1471-2105. |
Predicting and exploring network components involved in pathogenesis in the malaria parasite via novel subnetwork alignments Journal Article In: BMC systems biology, 9 (4), pp. 1, 2015. |
Predicting small group accretion in social networks: A topology based incremental approach Inproceedings In: 2015 IEEE/ACM International Conference on Advances in Social Networks Analysis and Mining (ASONAM), pp. 408–415, IEEE 2015, ISBN: 978-1-4503-3854-7. |
2013 |
Transfer learning across cancers on DNA copy number variation analysis Inproceedings In: 2013 IEEE 13th International Conference on Data Mining, pp. 1283–1288, IEEE, 2013, ISBN: 978-0-7695-5108-1. |
A novel subnetwork alignment approach predicts new components of the cell cycle regulatory apparatus in Plasmodium falciparum Journal Article In: BMC bioinformatics, 14 (12), pp. 1, 2013, ISSN: 1471-2105. |
Platinum-sensitive recurrence in ovarian cancer: the role of tumor microenvironment Journal Article In: Frontiers in oncology, 3 , pp. 251, 2013. |