From e6821943fe5934972dde42014880eff0c448b9ab Mon Sep 17 00:00:00 2001 From: Vitor Trovisco <vmt25@cam.ac.uk> Date: Tue, 28 Sep 2021 12:28:04 +0000 Subject: [PATCH] Update list-to-extension_config.R - improved commenting --- .../list-to-extension_config.R | 109 +++++++++++------- 1 file changed, 65 insertions(+), 44 deletions(-) diff --git a/extension_config-Rscript/list-to-extension_config.R b/extension_config-Rscript/list-to-extension_config.R index be48b40..8033296 100644 --- a/extension_config-Rscript/list-to-extension_config.R +++ b/extension_config-Rscript/list-to-extension_config.R @@ -1,41 +1,55 @@ -# This Rscript creates an extension_config.tsv file that defines which qualifiers (extensions in pombase lingo) can go with which type of annotation, specified in 'allowed_qualifiers.tsv' -# The list of allowed qualifiers comes as an array of namespaces -# The annotation types are defined as the top term IDs for the type of annotation - # phenotypic class - FBcv:0000347 - # anatomy/manifests uses two ontologies and thus needs two ids: +# This Rscript creates an extension_config.tsv file, which configures which qualifiers (extensions in PomBase lingo) are allowed with which annotation type, +# The input list of allowed qualifiers (in 'allowed_qualifiers.tsv') comes as an array of namespaces. However, these have to be converted into the parent cv term IDs that have that namespace. +# In addition, annotation types in extension_config.tsv are defined as their top/parent term IDs. + # So: + # for the phenotypic class annotation/ontology, the top CV term 'phenotypic class' is selected by its ontology ID 'FBcv:0000347' + # for anatomy ontology, as it is a meta-ontology of the merge of the FlyBase Anatomy ontology and the GO cellular component ontology, it needs the top term IDs of both of them: # flybase anatomy - FBbt:10000000 - # GO CC - GO:0005575. 
-# extension_config.tsv only uses IDs and, therefore, qualifier namespaces must be converted into their topterm IDs, whcih requires screening the FBcv obo file. + # GO CC - GO:0005575 +#cleanup R rm(list=ls()) + #set work directory as canto-space setwd("./") -# 1 - pick the default namespaces from the FBcv ontology -##tempFBcvobo.txt is a derivation of the original FBcv obo file, stripped of text, etc, and with ':' replaced by '\t' so that R can read it as a 2 colmun table + + +# 1 - state the namespace for every term in the FBcv ontology + +# list-to-extension_config.sh derived tempFBcvobo.txt from the starting FBcv obo file, stripping text, etc, and replacing ':' with '\t', so that +# R can import tempFBcvobo.txt as a 2-column table in the obo variable : obo<-read.table("./extension_config-Rscript/tempFBcvobo.txt", sep='\t', fill=TRUE, quote = "") obo[,1]<-as.character(obo[,1]) obo[,2]<-as.character(obo[,2]) +#get which rows correspond to an id ids<-which(obo[,1]=="id") idl<-length(ids) + +# a variable will be used to count non-obsolete terms a<-0 -#Create a working table for term-term relationships, etc +# Create an empty working table for term-term relationships, etc Df<-data.frame(matrix(ncol=6), stringsAsFactors = FALSE) colnames(Df)<-c("cv term","name","namespace","parent term","top namespace", "topmost namespace?") -#loop to remove obsolete terms, and move CV, name, namespace, and relation into separate columns +# loop to go through the term ids and remove obsolete terms for (i in 1:(idl-1)) { obo1<-obo[(ids[i]:ids[i+1]),] obsolete<-which(obo1[,1]=="is_obsolete", obo1[,2]=="true") - if (length(obsolete)==0){ + #if not obsolete, i.e. length(obsolete)==0, move CV, name, namespace, and relation into separate columns + if (length(obsolete)==0){ a=a+1 + #put CV id on the 1st column Df[a,1]=obo[ids[i],2] + #put CV term/name on the 2nd column Df[a,2]=obo[ids[i]+1,2] + #if the CV namespace is stated, put it on the 3rd column b<-which(obo1[,1]=="namespace") if
(length(b)==1){ Df[a,3]<-obo[(ids[i]+b-1),2] } + #if a "is_a" parent-child relationship is stated, put it on the 4th column c<-which(obo1[,1]=="is_a") if (length(c)==1){ Df[a,4]<-obo[(ids[i]+c-1),2] @@ -52,25 +66,25 @@ if (length(obsolete)==0){ Df[a,2]=obo[ids[i+1]+1,2] b=which(obo1[,1]=="namespace") if (length(b)==1){ - Df[a,3]<-obo[(ids[i+1]+b),2] + Df[a,3]<-obo[(ids[i+1]+b),2] } c<-which(obo1[,1]=="is_a") if (length(c)==1){ - Df[a,4]<-obo[(ids[i+1]+c-1),2] + Df[a,4]<-obo[(ids[i+1]+c-1),2] } } -#for the is_a field, this will crop the CV term (e.g. FBcv:0000001 -> 4 letters+colon+7 numbers, adding up to 12 characters) +#for the is_a field, on the 4th column (Df[,4]), this step will extract the CV term and exclude all the rest (e.g. 'is_a FBcv:0000683 ! temperature response defective' only keep FBcv:0000683, by keeping the 1st to 12th characters: 4 letters+colon+7 numbers = 12 characters) Df[,4]<-substr(Df[,4], 1, 12) -#Loop to pick the top-most namespace, using the is_a parent-child relationships -## 1 - set top namespace +#Loop through rows/terms to pick the most granular namespace possible, using the is_a parent-child relationships +## 1st, add the top-most namespace to the top term of the ontology Df[which(Df[,1]=="FBcv:0000000"),3]<-"FlyBase_miscellaneous_CV" -## 2 - if, existing, pick the current namespace +## 2nd - if stated, pick the current namespace (on the 3rd column) as the 'top namespace' (add to the 5th column) for(j in 1:nrow(Df)){ if(is.na(Df[j,3])==FALSE){Df[j,5]<-Df[j,3]} } -## 3 - otherwise, go through all childs of a term and, if child does not have a namespace, pick the parent namespace +## 3rd - otherwise, go through all children of a term and, if it does not have a namespace assigned (on the 3rd column), pick the parent's namespace as its 'top namespace' (add to the 5th column) rm(j) for(j in 1:nrow(Df)){ childs<-which(Df[,4]==Df[j,1]) @@ -78,7 +92,7 @@ for(j in 1:nrow(Df)){ for(k in 1:length(childs)){ if(is.na(Df[childs[k],5])==FALSE){}
else{if(is.na(Df[j,3])==FALSE){ - Df[childs[k],5]<-Df[j,3]} + Df[childs[k],5]<-Df[j,3]} else{ if(is.na(Df[j,5])==FALSE){Df[childs[k],5]<-Df[j,5]} } @@ -94,14 +108,17 @@ for(j in 1:nrow(Df)){ } -# create the template for the 'extension_config.tsv' file + +# 2 - create the 'extension_config.tsv' file + +# create template table for the 'extension_config.tsv' file Df1<-data.frame(matrix(ncol=8), stringsAsFactors = FALSE) colnames(Df1)<-c("domain ID", "subset relation", "extension relation", "range ID", "Canto display text", "Help text", "cardinality", "role") #read file with list of allowed qualifiers (on "allowed_qualifiers.tsv") quallist<-read.table("./extension_config-Rscript/allowed_qualifiers.tsv", sep="\t", header=TRUE, stringsAsFactors=FALSE) -#for each type of allowed qualifier, replace the namespace with corresponding top term with that namespace +#for each type of allowed qualifier, defined by namespace, replace the namespace with the id of the top-most term of that namespace count<-1 phenpos<-c() for(m in 1:nrow(quallist)){ @@ -115,6 +132,8 @@ for(m in 1:nrow(quallist)){ } phenqual1[l]<-paste(phenqual,collapse = "|") } + + # make list of term ids of allowed qualifiers for phenotype-type of annotations - phenotype calls and anatomy if(quallist[m,2]=='phenotype'){ #if the annotations are for 'phenotypic class' (FBcv:0000347) then: # a) the single term 'progressive' shoud be allowed @@ -123,13 +142,13 @@ for(m in 1:nrow(quallist)){ phenqual1<-append(phenqual1, Df[which(Df[,2]=="progressive"), 1]) Df1[count,4]<-paste(phenqual1,collapse = '|') count<-count+1 - # b) and 'the 'fertility/sterility terms must not allow developmental stage terms - ##once the ontology file is updated so that all viability/mortality/life-span terms are under a common parent term, this next bit should be updated + # b) and the fertility/sterility terms must not allow developmental stage qualifiers + ## in the future, when the ontology is updated so that viability/mortality/life-span
terms are all under a common parent, this next bit should be updated viable<-c("viable","increased mortality during development","short lived","long lived") for (n in 1:length(viable)){ - viable[n]<-Df[which(Df[,2]==viable[n]),1] + viable[n]<-Df[which(Df[,2]==viable[n]),1] } - #once the ontology file is updated to put term with decreased fertility under a common parent term (e.g. 'decreased fertily'), the next bit should be updated to refer to the new top term + # in the future, when the ontology is updated so that decreased fertility terms are all under a common parent term (e.g. 'decreased fertility'), the next bit should be updated to refer to the new top term sterile<-c("fertile", "sterile","semi-sterile","semi-fertile") for (o in 1:length(sterile)){ sterile[o]<-Df[which(Df[,2]==sterile[o]),1] @@ -142,18 +161,20 @@ for(m in 1:nrow(quallist)){ Df1[count,7]<-"0,1" count<-count+1 } - else{ - Df1[count,1]<-quallist[m,1] - Df1[count,4]<-paste(phenqual1, collapse = '|') - count<-count+1 - Df1[count,1]<-quallist[m,1] - Df1[count,4]<-"FBdv:00007013|FBdv:00007024|FBdv:00005259|FBdv:00000000" - Df1[count,3]<-"dv_qual" - Df1[count,5]<-"Developmental stage qualifier" - Df1[count,7]<-"0,1" - count<-count+1 - } + else{ + Df1[count,1]<-quallist[m,1] + Df1[count,4]<-paste(phenqual1, collapse = '|') + count<-count+1 + Df1[count,1]<-quallist[m,1] + Df1[count,4]<-"FBdv:00007013|FBdv:00007024|FBdv:00005259|FBdv:00000000" + Df1[count,3]<-"dv_qual" + Df1[count,5]<-"Developmental stage qualifier" + Df1[count,7]<-"0,1" + count<-count+1 + } } + + # make list of term ids of allowed qualifiers for genetic interaction-type of annotations if(quallist[m,2]=='genetic interaction'){ Df1[count,1]<-quallist[m,1] Df1[count,4]<-paste(phenqual1, collapse = '|') @@ -165,19 +186,19 @@ for(m in 1:nrow(quallist)){ for (j in 1:nrow(Df1)){ -if (is.na(Df1[j,2])==TRUE){Df1[j,2]<-as.character("is_a")} + if (is.na(Df1[j,2])==TRUE){Df1[j,2]<-as.character("is_a")} else{} -if
(is.na(Df1[j,3])==TRUE){Df1[j,3]<-as.character("qual")} + if (is.na(Df1[j,3])==TRUE){Df1[j,3]<-as.character("qual")} else{} -if (is.na(Df1[j,5])==TRUE){Df1[j,5]<-as.character("Qualifier")} + if (is.na(Df1[j,5])==TRUE){Df1[j,5]<-as.character("Qualifier")} else{} -if (is.na(Df1[j,6])==TRUE){Df1[j,6]<-as.character("")} + if (is.na(Df1[j,6])==TRUE){Df1[j,6]<-as.character("")} else{} -if (is.na(Df1[j,7])==TRUE){Df1[j,7]<-as.character("*")} + if (is.na(Df1[j,7])==TRUE){Df1[j,7]<-as.character("*")} else{} -if (is.na(Df1[j,8])==TRUE){Df1[j,8]<-as.character("user")} + if (is.na(Df1[j,8])==TRUE){Df1[j,8]<-as.character("user")} else{} } +# Write Df1 table into the extension_config.tsv file write.table(Df1, file = "./canto/extension_config.tsv", sep='\t', quote= FALSE, row.names = FALSE) -#for each type of allowed qualifier, replace the namespace with corresponding top term with that namespace -- GitLab