From e6821943fe5934972dde42014880eff0c448b9ab Mon Sep 17 00:00:00 2001
From: Vitor Trovisco <vmt25@cam.ac.uk>
Date: Tue, 28 Sep 2021 12:28:04 +0000
Subject: [PATCH] Update list-to-extension_config.R - improved commenting

---
 .../list-to-extension_config.R                | 109 +++++++++++-------
 1 file changed, 65 insertions(+), 44 deletions(-)

diff --git a/extension_config-Rscript/list-to-extension_config.R b/extension_config-Rscript/list-to-extension_config.R
index be48b40..8033296 100644
--- a/extension_config-Rscript/list-to-extension_config.R
+++ b/extension_config-Rscript/list-to-extension_config.R
@@ -1,41 +1,55 @@
-# This Rscript creates an extension_config.tsv file that defines which qualifiers (extensions in pombase lingo) can go with which type of annotation, specified in 'allowed_qualifiers.tsv'
-# The list of allowed qualifiers comes as an array of namespaces
-# The annotation types are defined as the top term IDs for the type of annotation
-  # phenotypic class - FBcv:0000347
-  # anatomy/manifests uses two ontologies and thus needs two ids:
+# This Rscript creates an extension_config.tsv file, which configures which qualifiers (extensions in PomBase lingo) are allowed with which annotation type.
+# The input list of allowed qualifiers (in 'allowed_qualifiers.tsv') comes as an array of namespaces. However, these have to be converted into the IDs of the parent CV terms that have that namespace.
+# In addition, annotation types in extension_config.tsv are defined by their top/parent term IDs.
+  # So:
+    # for the phenotypic class annotation/ontology, the top CV term 'phenotypic class' is selected by its ontology ID 'FBcv:0000347'
+    # for the anatomy ontology, which is a meta-ontology merging the FlyBase Anatomy ontology and the GO cellular component ontology, the top term IDs of both are needed:
     # flybase anatomy - FBbt:10000000
-    # GO CC - GO:0005575.
-# extension_config.tsv only uses IDs and, therefore, qualifier namespaces must be converted into their topterm IDs, whcih requires screening the FBcv obo file.
+    # GO CC - GO:0005575
 
+#cleanup R
 rm(list=ls())
+
 #set work directory as canto-space
 setwd("./")
 
-# 1 - pick the default namespaces from the FBcv ontology 
-##tempFBcvobo.txt is a derivation of the original FBcv obo file, stripped of text, etc, and with ':' replaced by '\t' so that R can read it as a 2 colmun table
+
+
+# 1 - state the namespace for every term in the FBcv ontology
+
+# list-to-extension_config.sh derives tempFBcvobo.txt from the starting FBcv obo file, stripping text, etc, and replacing ':' with '\t', so that
+# R can import tempFBcvobo.txt as a 2-column table into the 'obo' variable:
 obo<-read.table("./extension_config-Rscript/tempFBcvobo.txt", sep='\t', fill=TRUE, quote = "")
 obo[,1]<-as.character(obo[,1])
 obo[,2]<-as.character(obo[,2])
+#get the row indices of obo whose 1st column is "id"
 ids<-which(obo[,1]=="id")
 idl<-length(ids)
+
+# 'a' counts the non-obsolete terms (it is also used as the row index into Df)
 a<-0
 
-#Create a working table for term-term relationships, etc
+# Create an empty working table for term-term relationships, etc
 Df<-data.frame(matrix(ncol=6), stringsAsFactors = FALSE)
 colnames(Df)<-c("cv term","name","namespace","parent term","top namespace", "topmost namespace?")
 
-#loop to remove obsolete terms, and move CV, name, namespace, and relation into separate columns
+# loop to go through the term ids and remove obsolete terms
 for (i in 1:(idl-1)) {
   obo1<-obo[(ids[i]:ids[i+1]),]
   obsolete<-which(obo1[,1]=="is_obsolete", obo1[,2]=="true")
-    if (length(obsolete)==0){
+  #if not obsolete, i.e. length(obsolete)==0, move CV, name, namespace, and relation into separate columns
+  if (length(obsolete)==0){
     a=a+1
+    #put CV id on the 1st column
     Df[a,1]=obo[ids[i],2]
+    #put CV term/name on the 2nd column
     Df[a,2]=obo[ids[i]+1,2]
+    #if the CV namespace is stated, put it on the 3rd column
     b<-which(obo1[,1]=="namespace")
     if (length(b)==1){
       Df[a,3]<-obo[(ids[i]+b-1),2]
     }
+    #if a "is_a" parent-child relationship is stated, put it on the 4th column
     c<-which(obo1[,1]=="is_a")
     if (length(c)==1){
       Df[a,4]<-obo[(ids[i]+c-1),2]
@@ -52,25 +66,25 @@ if (length(obsolete)==0){
   Df[a,2]=obo[ids[i+1]+1,2]
   b=which(obo1[,1]=="namespace")
   if (length(b)==1){
-  Df[a,3]<-obo[(ids[i+1]+b),2]
+    Df[a,3]<-obo[(ids[i+1]+b),2]
   }
   c<-which(obo1[,1]=="is_a")
   if (length(c)==1){
-  Df[a,4]<-obo[(ids[i+1]+c-1),2]
+    Df[a,4]<-obo[(ids[i+1]+c-1),2]
   }
 }
 
-#for the is_a field, this will crop the CV term (e.g. FBcv:0000001 -> 4 letters+colon+7 numbers, adding up to 12 characters)
+#for the is_a field, in the 4th column (Df[,4]), this step extracts the CV term ID and discards the rest (e.g. for 'is_a	FBcv:0000683 ! temperature response defective' only 'FBcv:0000683' is kept, by taking the 1st to 12th characters: 4 letters + colon + 7 digits = 12 characters)
 Df[,4]<-substr(Df[,4], 1, 12)
 
-#Loop to pick the top-most namespace, using the is_a parent-child relationships
-## 1 - set top namespace
+#Loop through rows/terms to pick the most granular namespace possible, using the is_a parent-child relationships
+## 1st, add the top-most namespace to the top term of the ontology
 Df[which(Df[,1]=="FBcv:0000000"),3]<-"FlyBase_miscellaneous_CV"
-## 2 - if, existing, pick the current namespace
+## 2nd - if stated, pick the current namespace (on the 3rd column) as the 'top namespace' (add to the 5th column)
 for(j in 1:nrow(Df)){
   if(is.na(Df[j,3])==FALSE){Df[j,5]<-Df[j,3]}
 }
-## 3 - otherwise, go through all childs of a term and, if child does not have a namespace, pick the parent namespace
+## 3rd - otherwise, go through all children of a term and, if a child does not have a namespace assigned (on the 3rd column), pick the parent's namespace as its 'top namespace' (add to the 5th column)
 rm(j)
 for(j in 1:nrow(Df)){
   childs<-which(Df[,4]==Df[j,1])
@@ -78,7 +92,7 @@ for(j in 1:nrow(Df)){
     for(k in 1:length(childs)){
       if(is.na(Df[childs[k],5])==FALSE){}
       else{if(is.na(Df[j,3])==FALSE){
-      Df[childs[k],5]<-Df[j,3]}
+        Df[childs[k],5]<-Df[j,3]}
         else{
           if(is.na(Df[j,5])==FALSE){Df[childs[k],5]<-Df[j,5]}
         }
@@ -94,14 +108,17 @@ for(j in 1:nrow(Df)){
 }
 
 
-# create the template for the 'extension_config.tsv' file
+
+# 2 - create the 'extension_config.tsv' file
+
+# create template table for the 'extension_config.tsv' file
 Df1<-data.frame(matrix(ncol=8), stringsAsFactors = FALSE)
 colnames(Df1)<-c("domain ID", "subset relation", "extension relation", "range ID", "Canto display text", "Help text", "cardinality", "role")
 
 #read file with list of allowed qualifiers (on "allowed_qualifiers.tsv")
 quallist<-read.table("./extension_config-Rscript/allowed_qualifiers.tsv", sep="\t", header=TRUE, stringsAsFactors=FALSE)
 
-#for each type of allowed qualifier, replace the namespace with corresponding top term with that namespace
+#for each type of allowed qualifier, defined by namespace, replace the namespace with the id of the top-most term of that namespace
 count<-1
 phenpos<-c()
 for(m in 1:nrow(quallist)){
@@ -115,6 +132,8 @@ for(m in 1:nrow(quallist)){
     }
     phenqual1[l]<-paste(phenqual,collapse = "|")
   }
+  
+  # make list of term ids of allowed qualifiers for phenotype-type annotations - phenotype calls and anatomy
   if(quallist[m,2]=='phenotype'){
     #if the annotations are for 'phenotypic class' (FBcv:0000347) then:
     # a) the single term 'progressive' shoud be allowed
@@ -123,13 +142,13 @@ for(m in 1:nrow(quallist)){
       phenqual1<-append(phenqual1, Df[which(Df[,2]=="progressive"), 1])
       Df1[count,4]<-paste(phenqual1,collapse = '|')
       count<-count+1
-    # b) and 'the 'fertility/sterility terms must not allow developmental stage terms
-    ##once the ontology file is updated so that all viability/mortality/life-span terms are under a common parent term, this next bit should be updated
+      # b) and the fertility/sterility terms must not allow developmental stage qualifiers
+      ## in the future, when the ontology is updated so that viability/mortality/life-span terms are all under a common parent, this next bit should be updated
       viable<-c("viable","increased mortality during development","short lived","long lived")
       for (n in 1:length(viable)){ 
-      viable[n]<-Df[which(Df[,2]==viable[n]),1]
+        viable[n]<-Df[which(Df[,2]==viable[n]),1]
       }
-      #once the ontology file is updated to put term with decreased fertility under a common parent term (e.g. 'decreased fertily'), the next bit should be updated to refer to the new top term
+      # in the future, when the ontology is updated so that decreased fertility terms are all under a common parent term (e.g. 'decreased fertility'), the next bit should be updated to refer to the new top term
       sterile<-c("fertile", "sterile","semi-sterile","semi-fertile")
       for (o in 1:length(sterile)){
         sterile[o]<-Df[which(Df[,2]==sterile[o]),1]
@@ -142,18 +161,20 @@ for(m in 1:nrow(quallist)){
       Df1[count,7]<-"0,1"
       count<-count+1
     }
-  else{
-    Df1[count,1]<-quallist[m,1]
-    Df1[count,4]<-paste(phenqual1, collapse = '|')
-    count<-count+1
-    Df1[count,1]<-quallist[m,1]
-    Df1[count,4]<-"FBdv:00007013|FBdv:00007024|FBdv:00005259|FBdv:00000000"
-    Df1[count,3]<-"dv_qual"
-    Df1[count,5]<-"Developmental stage qualifier"
-    Df1[count,7]<-"0,1"
-    count<-count+1
-  }
+    else{
+      Df1[count,1]<-quallist[m,1]
+      Df1[count,4]<-paste(phenqual1, collapse = '|')
+      count<-count+1
+      Df1[count,1]<-quallist[m,1]
+      Df1[count,4]<-"FBdv:00007013|FBdv:00007024|FBdv:00005259|FBdv:00000000"
+      Df1[count,3]<-"dv_qual"
+      Df1[count,5]<-"Developmental stage qualifier"
+      Df1[count,7]<-"0,1"
+      count<-count+1
+    }
   }
+  
+  # make list of term ids of allowed qualifiers for genetic interaction-type annotations
   if(quallist[m,2]=='genetic interaction'){
     Df1[count,1]<-quallist[m,1]
     Df1[count,4]<-paste(phenqual1, collapse = '|')
@@ -165,19 +186,19 @@ for(m in 1:nrow(quallist)){
 
 
 for (j in 1:nrow(Df1)){
-if (is.na(Df1[j,2])==TRUE){Df1[j,2]<-as.character("is_a")}
+  if (is.na(Df1[j,2])==TRUE){Df1[j,2]<-as.character("is_a")}
   else{}
-if (is.na(Df1[j,3])==TRUE){Df1[j,3]<-as.character("qual")}
+  if (is.na(Df1[j,3])==TRUE){Df1[j,3]<-as.character("qual")}
   else{}
-if (is.na(Df1[j,5])==TRUE){Df1[j,5]<-as.character("Qualifier")}
+  if (is.na(Df1[j,5])==TRUE){Df1[j,5]<-as.character("Qualifier")}
   else{}
-if (is.na(Df1[j,6])==TRUE){Df1[j,6]<-as.character("")}
+  if (is.na(Df1[j,6])==TRUE){Df1[j,6]<-as.character("")}
   else{}
-if (is.na(Df1[j,7])==TRUE){Df1[j,7]<-as.character("*")}
+  if (is.na(Df1[j,7])==TRUE){Df1[j,7]<-as.character("*")}
   else{}
-if (is.na(Df1[j,8])==TRUE){Df1[j,8]<-as.character("user")}
+  if (is.na(Df1[j,8])==TRUE){Df1[j,8]<-as.character("user")}
   else{}
 }
 
+# Write Df1 table into the extension_config.tsv file
 write.table(Df1, file = "./canto/extension_config.tsv", sep='\t', quote= FALSE, row.names = FALSE)
-#for each type of allowed qualifier, replace the namespace with corresponding top term with that namespace
-- 
GitLab