# This Rscript creates an extension_config.tsv file that defines which qualifiers (extensions in pombase lingo) can go with which type of annotation, specified in 'allowed_qualifiers.tsv'
# The list of allowed qualifiers comes as an array of namespaces
# The annotation types are defined as the top term IDs for the type of annotation
# phenotypic class - FBcv:0000347
# anatomy/manifests uses two ontologies and thus needs two ids:
# This Rscript creates an extension_config.tsv file, which configures which qualifiers (extensions in PomBase lingo) are allowed with which annotation type.
# The input list of allowed qualifiers (in 'allowed_qualifiers.tsv') comes as an array of namespaces. However, these have to be converted into the parent cv term IDs that have that namespace.
# In addition, annotation types in extension_config.tsv are defined as their top/parent term IDs.
# So:
# for the phenotypic class annotation/ontology, the top CV term 'phenotypic class' is selected by its ontology ID 'FBcv:0000347'
# for anatomy ontology, as it is a meta-ontology of the merge of the FlyBase Anatomy ontology and the GO cellular component ontology, it needs the top term IDs of both of them:
# flybase anatomy - FBbt:10000000
# GO CC - GO:0005575.
# extension_config.tsv only uses IDs and, therefore, qualifier namespaces must be converted into their top term IDs, which requires screening the FBcv obo file.
# GO CC - GO:0005575
# Clean up R: remove all objects listed by ls() so the script starts from a blank workspace
rm(list=ls())
# Set the work directory: "./" is the current directory, so this keeps whatever directory the script was launched from
# NOTE(review): presumably the script must therefore be launched from canto-space - confirm against the calling shell script
setwd("./")
# 1 - pick the default namespaces from the FBcv ontology
##tempFBcvobo.txt is a derivation of the original FBcv obo file, stripped of text, etc, and with ':' replaced by '\t' so that R can read it as a 2-column table
# 1 - state the namespace for every term in the FBcv ontology
# list-to-extension_config.sh derived tempFBcvobo.txt from the starting FBcv obo file, stripping text, etc, and replacing ':' with '\t', so that
# R can import tempFBcvobo.txt as a 2-column table in the obo variable :
#if not obsolete, i.e. length(obsolete)==0, move CV, name, namespace, and relation into separate columns
if(length(obsolete)==0){
a=a+1
#put CV id on the 1st column
Df[a,1]=obo[ids[i],2]
#put CV term/name on the 2nd column
Df[a,2]=obo[ids[i]+1,2]
#if the CV namespace is stated, put it on the 3rd column
b<-which(obo1[,1]=="namespace")
if(length(b)==1){
Df[a,3]<-obo[(ids[i]+b-1),2]
}
#if a "is_a" parent-child relationship is stated, put it on the 4th column
c<-which(obo1[,1]=="is_a")
if(length(c)==1){
Df[a,4]<-obo[(ids[i]+c-1),2]
...
...
@@ -52,25 +66,25 @@ if (length(obsolete)==0){
Df[a,2]=obo[ids[i+1]+1,2]
b=which(obo1[,1]=="namespace")
if(length(b)==1){
Df[a,3]<-obo[(ids[i+1]+b),2]
Df[a,3]<-obo[(ids[i+1]+b),2]
}
c<-which(obo1[,1]=="is_a")
if(length(c)==1){
Df[a,4]<-obo[(ids[i+1]+c-1),2]
Df[a,4]<-obo[(ids[i+1]+c-1),2]
}
}
#for the is_a field, on the 4th column (Df[,4]), keep only the parent CV term ID and drop the rest,
#e.g. 'is_a FBcv:0000683 ! temperature response defective' -> 'FBcv:0000683'.
#An ID is 4 letters + colon + 7 digits = 12 characters, so only the 1st to 12th characters are kept.
#NOTE(review): this assumes the ID starts at the 1st character of the field and is always 12 characters long - confirm
Df[,4]<-substr(Df[,4],1,12)
#Loop to pick the top-most namespace, using the is_a parent-child relationships
## 1 - set top namespace
#Loop through rows/terms to pick the most granular namespace possible, using the is_a parent-child relationships
## 1st, add the top-most namespace to the top term of the ontology
## 2nd - if stated, pick the current namespace (on the 3rd column) as the 'top namespace' (add to the 5th column)
# Seed the 'top namespace' (5th column) with each term's own namespace (3rd
# column) wherever one is stated; rows with no namespace are left untouched
# here.
# seq_len() is used instead of 1:nrow(Df) so that an empty Df is skipped
# rather than iterated as rows 1 and 0.
for (j in seq_len(nrow(Df))) {
  if (!is.na(Df[j, 3])) {
    Df[j, 5] <- Df[j, 3]
  }
}
## 3 - otherwise, go through all children of a term and, if a child does not have a namespace, pick the parent namespace
## 3rd - otherwise, go through all children of a term and, if a child does not have a namespace assigned (on the 3rd column), pick the parent's namespace as its 'top namespace' (add to the 5th column)
# b) and the 'fertility/sterility' terms must not allow developmental stage terms
##once the ontology file is updated so that all viability/mortality/life-span terms are under a common parent term, this next bit should be updated
# b) and the 'fertility/sterility' terms must not allow developmental stage qualifiers
## in the future, when the ontology is updated so that viability/mortality/life-span terms are all under a common parent, this next bit should be updated
# Names of the viability/mortality/life-span terms. Each name is replaced
# below by the value in the 1st column of Df (the CV id) for the row whose
# 2nd column (the term name) equals it.
viable <- c("viable", "increased mortality during development", "short lived", "long lived")
for (n in seq_along(viable)) {
  # Look each name up exactly once: after the assignment viable[n] no longer
  # holds the term name, so repeating the lookup would normally match no row
  # and the zero-length replacement would abort the script.
  viable[n] <- Df[which(Df[, 2] == viable[n]), 1]
}
#once the ontology file is updated to put terms with decreased fertility under a common parent term (e.g. 'decreased fertility'), the next bit should be updated to refer to the new top term
# in the future, when the ontology is updated so that decreased fertility terms are all under a common parent term (e.g. 'decreased fertility'), the next bit should be updated to refer to the new top term