FAQ | This is a LIVE service | Changelog

Skip to content
Snippets Groups Projects
Commit e6821943 authored by Vitor Trovisco's avatar Vitor Trovisco
Browse files

Update list-to-extension_config.R - improved commenting

parent fc4dc2c7
No related branches found
No related tags found
No related merge requests found
# This Rscript creates an extension_config.tsv file that defines which qualifiers (extensions in pombase lingo) can go with which type of annotation, specified in 'allowed_qualifiers.tsv'
# The list of allowed qualifiers comes as an array of namespaces
# The annotation types are defined as the top term IDs for the type of annotation
# phenotypic class - FBcv:0000347
# anatomy/manifests uses two ontologies and thus needs two ids:
# This Rscript creates an extension_config.tsv file, which configures which qualifiers (extensions in PomBase lingo) are allowed with which annotation type.
# The input list of allowed qualifiers (in 'allowed_qualifiers.tsv') comes as an array of namespaces. However, these have to be converted into the parent cv term IDs that have that namespace.
# In addition, annotation types in extension_config.tsv are defined as their top/parent term IDs.
# So:
# for the phenotypic class annotation/ontology, the top CV term 'phenotypic class' is selected by its ontology ID 'FBcv:0000347'
# for anatomy ontology, as it is a meta-ontology of the merge of the FlyBase Anatomy ontology and the GO cellular component ontology, it needs the top term IDs of both of them:
# flybase anatomy - FBbt:10000000
# GO CC - GO:0005575.
# extension_config.tsv only uses IDs and, therefore, qualifier namespaces must be converted into their top-term IDs, which requires scanning the FBcv obo file.
# GO CC - GO:0005575
# Start from a clean environment: drop every object currently defined
rm(list = ls())
# All paths below are relative; presumably the script is launched from the
# canto-space directory. setwd("./") leaves the working directory unchanged.
setwd("./")
# 1 - state the namespace for every term in the FBcv ontology
# Per the original notes, list-to-extension_config.sh derives tempFBcvobo.txt
# from the FBcv obo file (free text stripped, ':' replaced by '\t') so that R
# can import it as a 2-column table (tag, value) into `obo`
obo <- read.table(
  "./extension_config-Rscript/tempFBcvobo.txt",
  sep = '\t', fill = TRUE, quote = ""
)
# make sure both columns are plain character vectors
obo[1:2] <- lapply(obo[1:2], as.character)
# row numbers of the lines whose tag is "id", i.e. where a term starts
ids <- which(obo[, 1] == "id")
idl <- length(ids)
# `a` counts the non-obsolete terms copied into Df so far
a <- 0
# Working table for term-term relationships, etc; it starts as one all-NA row
Df <- data.frame(matrix(nrow = 1, ncol = 6), stringsAsFactors = FALSE)
colnames(Df) <- c(
  "cv term", "name", "namespace",
  "parent term", "top namespace", "topmost namespace?"
)
#loop to remove obsolete terms, and move CV, name, namespace, and relation into separate columns
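# loop to go through the term ids and skip obsolete terms
# NOTE(review): the which() call below passes its second condition positionally, i.e. as the `arr.ind` argument, instead of combining both conditions with `&` - confirm obsolete terms are detected as intended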
for (i in 1:(idl-1)) {
obo1<-obo[(ids[i]:ids[i+1]),]
obsolete<-which(obo1[,1]=="is_obsolete", obo1[,2]=="true")
if (length(obsolete)==0){
#if not obsolete, i.e. length(obsolete)==0, move CV, name, namespace, and relation into separate columns
if (length(obsolete)==0){
a=a+1
#put CV id on the 1st column
Df[a,1]=obo[ids[i],2]
#put CV term/name on the 2nd column
Df[a,2]=obo[ids[i]+1,2]
#if the CV namespace is stated, put it on the 3rd column
b<-which(obo1[,1]=="namespace")
if (length(b)==1){
Df[a,3]<-obo[(ids[i]+b-1),2]
}
#if a "is_a" parent-child relationship is stated, put it on the 4th column
c<-which(obo1[,1]=="is_a")
if (length(c)==1){
Df[a,4]<-obo[(ids[i]+c-1),2]
......@@ -52,25 +66,25 @@ if (length(obsolete)==0){
Df[a,2]=obo[ids[i+1]+1,2]
b=which(obo1[,1]=="namespace")
if (length(b)==1){
Df[a,3]<-obo[(ids[i+1]+b),2]
Df[a,3]<-obo[(ids[i+1]+b),2]
}
c<-which(obo1[,1]=="is_a")
if (length(c)==1){
Df[a,4]<-obo[(ids[i+1]+c-1),2]
Df[a,4]<-obo[(ids[i+1]+c-1),2]
}
}
# The 4th column (Df[, 4]) holds the raw is_a value. Keep only its first 12
# characters, which is the length of a parent term ID such as FBcv:0000683
# (4 letters + colon + 7 digits = 12 characters); anything after is dropped.
Df[, 4] <- substr(Df[, 4], start = 1, stop = 12)
# Work out a namespace for every row/term, using the is_a parent-child
# relationships
## 1st - give the root term FBcv:0000000 the top-most namespace
Df[which(Df[, 1] == "FBcv:0000000"), 3] <- "FlyBase_miscellaneous_CV"
## 2nd - where a namespace is stated (3rd column), copy it into the
## 'top namespace' (5th column)
for (j in seq_len(nrow(Df))) {
  if (!is.na(Df[j, 3])) {
    Df[j, 5] <- Df[j, 3]
  }
}
## 3 - otherwise, go through all childs of a term and, if child does not have a namespace, pick the parent namespace
## 3rd - otherwise, go through all children of a term and, if a child does not have a namespace assigned (on the 3rd column), pick the parent's namespace as its 'top namespace' (add to the 5th column)
rm(j)
for(j in 1:nrow(Df)){
childs<-which(Df[,4]==Df[j,1])
......@@ -78,7 +92,7 @@ for(j in 1:nrow(Df)){
for(k in 1:length(childs)){
if(is.na(Df[childs[k],5])==FALSE){}
else{if(is.na(Df[j,3])==FALSE){
Df[childs[k],5]<-Df[j,3]}
Df[childs[k],5]<-Df[j,3]}
else{
if(is.na(Df[j,5])==FALSE){Df[childs[k],5]<-Df[j,5]}
}
......@@ -94,14 +108,17 @@ for(j in 1:nrow(Df)){
}
# 2 - create the 'extension_config.tsv' file
# Template table for 'extension_config.tsv': one all-NA row with 8 columns
Df1 <- data.frame(matrix(nrow = 1, ncol = 8), stringsAsFactors = FALSE)
colnames(Df1) <- c(
  "domain ID", "subset relation", "extension relation", "range ID",
  "Canto display text", "Help text", "cardinality", "role"
)
# Read the list of allowed qualifiers (tab-separated, with a header row)
quallist <- read.table(
  "./extension_config-Rscript/allowed_qualifiers.tsv",
  sep = "\t", header = TRUE, stringsAsFactors = FALSE
)
# For each type of allowed qualifier, defined by namespace, the namespace is
# replaced with the id of the top-most term of that namespace.
# `count` is the index of the next Df1 row to fill; `phenpos` starts out empty.
count <- 1
phenpos <- NULL
for(m in 1:nrow(quallist)){
......@@ -115,6 +132,8 @@ for(m in 1:nrow(quallist)){
}
phenqual1[l]<-paste(phenqual,collapse = "|")
}
# make list of term ids of allowed qualifiers for phenotype-type annotations - phenotype calls and anatomy
if(quallist[m,2]=='phenotype'){
#if the annotations are for 'phenotypic class' (FBcv:0000347) then:
# a) the single term 'progressive' should be allowed
......@@ -123,13 +142,13 @@ for(m in 1:nrow(quallist)){
phenqual1<-append(phenqual1, Df[which(Df[,2]=="progressive"), 1])
Df1[count,4]<-paste(phenqual1,collapse = '|')
count<-count+1
# b) and 'the 'fertility/sterility terms must not allow developmental stage terms
##once the ontology file is updated so that all viability/mortality/life-span terms are under a common parent term, this next bit should be updated
# b) and the fertility/sterility terms must not allow developmental stage qualifiers
## in the future, when the ontology is updated so that viability/mortality/life-span terms are all under a common parent, this next bit should be updated
viable<-c("viable","increased mortality during development","short lived","long lived")
for (n in 1:length(viable)){
viable[n]<-Df[which(Df[,2]==viable[n]),1]
viable[n]<-Df[which(Df[,2]==viable[n]),1]
}
#once the ontology file is updated to put term with decreased fertility under a common parent term (e.g. 'decreased fertily'), the next bit should be updated to refer to the new top term
# in the future, when the ontology is updated so that decreased fertility terms are all under a common parent term (e.g. 'decreased fertility'), the next bit should be updated to refer to the new top term
sterile<-c("fertile", "sterile","semi-sterile","semi-fertile")
for (o in 1:length(sterile)){
sterile[o]<-Df[which(Df[,2]==sterile[o]),1]
......@@ -142,18 +161,20 @@ for(m in 1:nrow(quallist)){
Df1[count,7]<-"0,1"
count<-count+1
}
else{
Df1[count,1]<-quallist[m,1]
Df1[count,4]<-paste(phenqual1, collapse = '|')
count<-count+1
Df1[count,1]<-quallist[m,1]
Df1[count,4]<-"FBdv:00007013|FBdv:00007024|FBdv:00005259|FBdv:00000000"
Df1[count,3]<-"dv_qual"
Df1[count,5]<-"Developmental stage qualifier"
Df1[count,7]<-"0,1"
count<-count+1
}
else{
Df1[count,1]<-quallist[m,1]
Df1[count,4]<-paste(phenqual1, collapse = '|')
count<-count+1
Df1[count,1]<-quallist[m,1]
Df1[count,4]<-"FBdv:00007013|FBdv:00007024|FBdv:00005259|FBdv:00000000"
Df1[count,3]<-"dv_qual"
Df1[count,5]<-"Developmental stage qualifier"
Df1[count,7]<-"0,1"
count<-count+1
}
}
# make list of term ids of allowed qualifiers for genetic interaction-type annotations
if(quallist[m,2]=='genetic interaction'){
Df1[count,1]<-quallist[m,1]
Df1[count,4]<-paste(phenqual1, collapse = '|')
......@@ -165,19 +186,19 @@ for(m in 1:nrow(quallist)){
# Fill in a default for every Df1 field that is still unset (NA).
# Columns 1 ("domain ID") and 4 ("range ID") have no default and are left as is.
#   column 2 "subset relation"     -> "is_a"
#   column 3 "extension relation"  -> "qual"
#   column 5 "Canto display text"  -> "Qualifier"
#   column 6 "Help text"           -> ""
#   column 7 "cardinality"         -> "*"
#   column 8 "role"                -> "user"
default_cols <- c(2, 3, 5, 6, 7, 8)
default_vals <- c("is_a", "qual", "Qualifier", "", "*", "user")
for (d in seq_along(default_cols)) {
  unset <- is.na(Df1[, default_cols[d]])
  # only assign when something is missing, so fully populated columns are
  # not touched at all
  if (any(unset)) {
    Df1[unset, default_cols[d]] <- default_vals[d]
  }
}
# Write the Df1 table into the extension_config.tsv file (tab-separated,
# unquoted, header row included, no row names)
write.table(Df1, file = "./canto/extension_config.tsv", sep='\t', quote= FALSE, row.names = FALSE)
#for each type of allowed qualifier, replace the namespace with corresponding top term with that namespace
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment