Estefani Gaytan Nunez

upload

Showing 56 changed files with 1597 additions and 3 deletions
......@@ -57,6 +57,12 @@ if (!length(opt)){
## Input files and output directories
infoFile <- opt$infoFile
if (!"gse" %in% names(gseInfo)){
stop("include at least gse column")
}
if (!"gsm" %in% names(gseInfo)){
gseInfor$gsm <- "GSM"
}
## Load main variables
......@@ -89,4 +95,4 @@ for (geoid in unique(gseInfo$gse)) {
}
cat("download id: ", length(list.dirs(opt$downloadPath, recursive = FALSE)))
message("Required GSE: ", ngse_down)
\ No newline at end of file
message("Required GSE: ", ngse_down)
......
......@@ -42,6 +42,11 @@ def get_crossref_info(info_df):
- **pmid**: PubMed ID
"""
def get_cite_info(info_df):
if(info_df.CASE_MATCH == "ZIKA"):
cite_dict ={
"evidence_id":"",
"evidence_name":"ZIKA",
"pmid" : info_df.PMID}
cite_dict ={
"evidence_id": "",
"evidence_name" : "NPL-CRF", #NPL
......@@ -49,6 +54,8 @@ def get_cite_info(info_df):
}
return(cite_dict)
def get_description(info_df):
if(info_df.CASE_MATCH=="ZIKA"):
mco_mapping = {}
if info_df.CASE_MATCH == "MCO" and info_df.SET == 100 and info_df.SORT == 100:
mco_mapping = {
"type": "term present on MCO"
......
from numpy import nan
from collections import OrderedDict
from pandas import DataFrame as DF
"""
- **name**: nombre del termino registrado en la MCO
- **term_id**: identificador del termino en RegulonDB (si existe)
- **term_type**: tipo de termino, uno de los siguientes: "Organism", "Genetic background", "Medium", Medium supplement", "Aeration", "Temperature","pH", "Pressure", "Optical Density (OD)", "Growth phase", "Growth rate", "Vessel type", "Agitation speed"
- **source_data**: subdocumento que contiene informacion de GEO de donde se extrajo la informacion de la GC
- **source**: fuente de los datos [ GEO, ]
- **id**: identificador del registro de la base de datos o fuente de datos
- **field**: campo de donde se esta tomando la informacion de la GC [ metadata field]
- **associatedPhrase**: Frase de donde se tomo la informacion
"""
def get_term_info(info_df, source):
    """Build the flat term descriptor for one annotated row.

    Keys mirror the documented schema above: term identity (name/id/type),
    plus provenance (repository file, source database, GSM id, metadata
    field, and the phrase the term was extracted from).
    """
    return {
        "name": info_df.TERM_NAME,              # NPL output
        "term_id": info_df.TERM_ID,             # MCO
        "term_type": info_df.TERM_TYPE,         # NPL
        "source_data": info_df.REPO_FILE,       # NPL
        "source": source,
        "id": info_df.GSM,                      # NPL
        "field": info_df.BANGLINE,              # NPL
        "associatedPhrase": info_df.FULL_TEXT,  # NPL
    }
"""
- **objectId**: Identificador en la base de datos fuente
- **externalCrossReferences_name**: nombre de la DB [ GEO ]
"""
def get_crossref_info(info_df, source):
    """Build the external cross-reference subdocument (sample id + DB name)."""
    crossref = {"objectId": info_df.GSM}  # NPL
    crossref["externalCrossReferences_name"] = source
    return crossref
"""
- **evidence_id**: Identificador de RegulondB asociado a la evidencia
- **evidence_name**: nombre de la evidencia, si es que no cuenta con el identificador
- **pmid**: PubMed ID
"""
def get_cite_info(info_df, esource):
    """Build the citation subdocument.

    evidence_id is not known at this stage, so it is always emitted empty;
    the evidence is identified by name (esource) and the row's PMID.
    """
    return {
        "evidence_id": "",
        "evidence_name": esource,
        "pmid": info_df.PMID,
    }
def get_description(info_df, no_map=False):
    """Describe how (or whether) the row's term was mapped to the MCO.

    Returns a small dict: "not present on MCO" when no_map is set,
    "term present on MCO" for an exact match (CASE_MATCH == "MCO" with
    perfect SET/SORT fuzzy scores), otherwise a string-similarity record
    carrying the SET score.
    """
    if no_map:
        return {"type": "not present on MCO"}
    exact_match = (info_df.CASE_MATCH == "MCO"
                   and info_df.SET == 100
                   and info_df.SORT == 100)
    if exact_match:
        return {"type": "term present on MCO"}
    return {"type": "string similarity", "score": info_df.SET}
"""
#run it in the main for each field
return: type
id: string
name: string
description: string
terms: list of dict
externalCrossReferences: list of dict
citations: list of dict
"""
def created_record(term_info_df, source = "GEO", no_map = False, esource = "NPL-CRF"):
    """Assemble one output record for a single annotated row.

    NaNs are blanked first so the JSON never carries nan values. The record
    keeps a fixed key order: id, name, description, terms,
    externalCrossReferences, citations (each list-valued field holds a
    single subdocument built by the matching helper).
    """
    clean = term_info_df.replace(nan, '', regex=True)
    record = OrderedDict()
    record["id"] = clean.TERM_ID        # present only if the term was mapped
    record["name"] = clean.TERM_NAME    # column from the NPL output
    record["description"] = [get_description(clean, no_map=no_map)]
    record["terms"] = [get_term_info(clean, source)]
    record["externalCrossReferences"] = [get_crossref_info(clean, source)]
    record["citations"] = [get_cite_info(clean, esource)]
    return record
def _mco_syn_rows(entry):
    """Yield (key, row) pairs for one MCO JSON entry.

    One row per synonym (key "<id>_<k>", lowercased synonym); falls back to
    "hasRelatedSynonyms", and emits a single blank-synonym row keyed by the
    plain id when the entry has no synonym list at all.
    """
    rid = entry['regulondb_id']
    name = entry['name']
    if "synonyms" in entry:
        syns = entry['synonyms']
    elif "hasRelatedSynonyms" in entry:
        syns = entry['hasRelatedSynonyms']
    else:
        yield rid, {'ENTITY_NAME': name, 'TERM_NAME': '', 'TERM_ID': rid}
        return
    for k, syn in enumerate(syns):
        yield rid + "_" + str(k), {'ENTITY_NAME': name,
                                   'TERM_NAME': syn.lower(),
                                   'TERM_ID': rid}

def json2DataFrame(data):
    """Flatten the MCO synonym JSON into a DataFrame.

    Entries without a "regulondb_id" are skipped. Returns one row per
    synonym with columns ENTITY_NAME, TERM_NAME, TERM_ID, indexed by
    "<regulondb_id>_<k>" (or the bare id when no synonyms exist).

    Refactor: the three duplicated dict-literal branches of the original
    are collapsed into the _mco_syn_rows helper; behavior is unchanged.
    """
    mco_syn_dic = dict()
    for i in data:
        if "regulondb_id" in i.keys():
            for dict_key, row in _mco_syn_rows(i):
                mco_syn_dic[dict_key] = row
    mco_syn_df = DF.from_dict(mco_syn_dic).T
    return(mco_syn_df)
from numpy import nan
#from collections import OrderedDict
from pandas import DataFrame as DF
import json
from collections import defaultdict
import format_fun_v6 as format_fun
def to_json(df, source_info, evidence_source, ofname):
    """Serialize the annotated rows to a JSON file grouped by SRR run id.

    First pass collects every term descriptor per SRR; second pass builds
    one record per distinct SRR (first occurrence wins) carrying its full
    term list, then the whole mapping is dumped to ofname.
    """
    terms_by_srr = defaultdict(list)
    for _, annotated in df.iterrows():
        term_record = format_fun.get_term_info(annotated,
                                               source=source_info,
                                               map=annotated.MAP)
        terms_by_srr[annotated.SRR].append(term_record)
    records = {}
    unique_runs = df.drop_duplicates("SRR", keep="first")
    for _, first_row in unique_runs.iterrows():
        records[first_row.SRR] = format_fun.created_record(
            info_df=first_row,
            term_list=terms_by_srr[first_row.SRR],
            source=source_info,
            esource=evidence_source)
    with open(ofname, "w") as output:
        json.dump(records, output, separators=(',', ':'), indent=4)
def get_score(info_df):
    """Return the MCO mapping descriptor for one matched row.

    Exact match (CASE_MATCH == "MCO" with perfect SET/SORT scores) yields
    {"type": "term present on MCO"}; anything else is reported as a
    string-similarity match with its SET score.

    Bug fix: the exact-match branch used to assign an unused local
    (`subtext`) and then return the unbound name `mco_mapping`, raising
    NameError for every exact match. Both branches now build `mco_mapping`.
    """
    if info_df.CASE_MATCH == "MCO" and info_df.SET == 100 and info_df.SORT == 100:
        mco_mapping = {
            "type": "term present on MCO"
        }
    else:
        mco_mapping = {
            "type": "string similarity",
            "score": info_df.SET
        }
    return(mco_mapping)
"""
- **name**: nombre del termino registrado en la MCO
- **term_id**: identificador del termino en RegulonDB (si existe)
- **term_type**: tipo de termino, uno de los siguientes: "Organism", "Genetic background", "Medium", Medium supplement", "Aeration", "Temperature","pH", "Pressure", "Optical Density (OD)", "Growth phase", "Growth rate", "Vessel type", "Agitation speed"
- **source_data**: subdocumento que contiene informacion de GEO de donde se extrajo la informacion de la GC
- **source**: fuente de los datos [ GEO, ]
- **id**: identificador del registro de la base de datos o fuente de datos
- **field**: campo de donde se esta tomando la informacion de la GC [ metadata field]
- **associatedPhrase**: Frase de donde se tomo la informacion
"""
def get_term_info(info_df, source, map=True):
    """Build the nested term descriptor for one annotated row.

    NaNs are blanked before reading any field. The provenance (source_data)
    subdocument carries the source DB, GSM id, metadata field, extracted
    phrase, a textual mapping status from get_description, and the SET
    fuzzy-match percentage.
    """
    info_df = info_df.replace(nan, "", regex=True)
    provenance = {
        "source": source,
        "id": info_df.GSM,                       # NPL
        "field": info_df.BANGLINE,               # NPL
        "associatedPhrase": info_df.FULL_TEXT,   # NPL
        "description": get_description(info_df, map),
        "similarity_percentage": info_df.SET,
    }
    return {
        "name": info_df.TERM_NAME,    # NPL output
        "term_id": info_df.TERM_ID,   # MCO
        "term_type": info_df.TERM_TYPE,  # NPL
        "source_data": provenance,
    }
"""
- **objectId**: Identificador en la base de datos fuente
- **externalCrossReferences_name**: nombre de la DB [ GEO ]
"""
def get_crossref_info(info_df, source):
    """Build the external cross-reference subdocument (GSM id + DB name)."""
    crossref = {"objectId": info_df.GSM}  # NPL
    crossref["externalCrossReferences_name"] = source
    return crossref
"""
- **evidence_id**: Identificador de RegulondB asociado a la evidencia
- **evidence_name**: nombre de la evidencia, si es que no cuenta con el identificador
- **pmid**: PubMed ID
"""
def get_cite_info(info_df, esource):
    """Build the citation subdocument; the RegulonDB evidence_id is not
    available here, so only the evidence name and PMID are filled in."""
    return {
        "evidence_id": "",
        "evidence_name": esource,
        "pmid": info_df.PMID,
    }
def get_description(info_df, map=True):
    """Return a human-readable mapping status for the row's term.

    "absent in RegulonDB MCO" when mapping was skipped, "RegulonDB MCO
    term" for an exact match (CASE_MATCH == "MCO" with perfect SET/SORT
    scores), otherwise "Similar term in RegulonDB MCO".
    """
    if not map:
        return "absent in RegulonDB MCO"
    if info_df.CASE_MATCH == "MCO" and info_df.SET == 100 and info_df.SORT == 100:
        return "RegulonDB MCO term"
    return "Similar term in RegulonDB MCO"
"""
#run it in the main for each field
return: type
id: string
name: string
description: string
terms: list of dict
externalCrossReferences: list of dict
citations: list of dict
"""
def created_record(info_df, term_list, source = "GEO", esource = "NPL-CRF"):
    """Assemble the per-SRR output record.

    Top-level id/name/description are intentionally left empty (they are
    not known at this stage); the record carries the pre-built term list
    plus cross-reference and citation subdocuments derived from the row.
    NaNs are blanked first so no nan leaks into the JSON.
    """
    clean = info_df.replace(nan, "", regex=True)
    return {
        "id": "",
        "name": "",
        "description": "",
        "terms": term_list,
        "externalCrossReferences": [get_crossref_info(clean, source)],
        "citations": [get_cite_info(clean, esource)],
    }
def _mco_syn_rows(entry):
    """Yield (key, row) pairs for one MCO JSON entry.

    One row per synonym (key "<id>_<k>", lowercased synonym); falls back to
    "hasRelatedSynonyms", and emits a single blank-synonym row keyed by the
    plain id when the entry has no synonym list at all.
    """
    rid = entry['regulondb_id']
    name = entry['name']
    if "synonyms" in entry:
        syns = entry['synonyms']
    elif "hasRelatedSynonyms" in entry:
        syns = entry['hasRelatedSynonyms']
    else:
        yield rid, {'ENTITY_NAME': name, 'TERM_NAME': '', 'TERM_ID': rid}
        return
    for k, syn in enumerate(syns):
        yield rid + "_" + str(k), {'ENTITY_NAME': name,
                                   'TERM_NAME': syn.lower(),
                                   'TERM_ID': rid}

def json2DataFrame(data):
    """Flatten the MCO synonym JSON into a DataFrame.

    Entries without a "regulondb_id" are skipped. Returns one row per
    synonym with columns ENTITY_NAME, TERM_NAME, TERM_ID, indexed by
    "<regulondb_id>_<k>" (or the bare id when no synonyms exist).

    Refactor: the three duplicated dict-literal branches of the original
    are collapsed into the _mco_syn_rows helper; behavior is unchanged.
    """
    mco_syn_dic = dict()
    for i in data:
        if "regulondb_id" in i.keys():
            for dict_key, row in _mco_syn_rows(i):
                mco_syn_dic[dict_key] = row
    mco_syn_df = DF.from_dict(mco_syn_dic).T
    return(mco_syn_df)
# -*- coding: utf-8 -*-
"""
#Setup
"""
#################### Setup ####################
# Maps CRF-extracted growth-condition terms to MCO ontology terms and dumps
# per-SRR JSON records.  Inputs: CRF-annotated TSV, MCO term/link tables and
# a synonym JSON; outputs: raw/sim/full TSVs plus *_map/_unmap/_list JSONs.
from collections import defaultdict
from optparse import OptionParser
import os
from numpy.core.fromnumeric import sort
from pandas import read_csv, DataFrame, merge, concat, read_table
from numpy import exp, nan
import seaborn as sns
from numpy import mean
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
# %matplotlib inline
from collections import Counter
import json
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import format_fun
import mapping_fun
import sys
"""
# input parameters
--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv
--iOntoFile gc_ontology_terms_v2.txt
--iLinksFile gc_ontology_terms_link_v2.txt
--iSynFile mco_terms_v0.2.json
--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/
--outputFile all_srr_IV_mapped.tsv
--minPerMatch 90
#Example
# nohup python3 mapping2MCO_v3.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv --iOntoFile gc_ontology_terms_v2.txt --iSynFile mco_terms_v0.2.json --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ --outputFile srr_htregulondb_mapped.tsv --minPerMatch 80 --minCRFProbs 0.9 > ../reports/srr_htregulondb_mapping_report.out &
"""
#################### Defining parameters ####################
if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option(
        "--inputPath",
        dest="input_path",
        help="Path of npl tagged file (crf output)",
        metavar="PATH")
    parser.add_option(
        "--iAnnotatedFile",
        dest="npl_fname",
        help="Input file of npl tagged file (crf output)",
        metavar="FILE",
        default="")
    parser.add_option(
        "--iOntoFile",
        dest="onto_fname",
        help="Input file with the ontology entities",
        metavar="FILE",
        default="")
    parser.add_option(
        "--iLinksFile",
        dest="links_fname",
        help="Input file with links and id for the ontology",
        metavar="FILE",
        default=None)
    parser.add_option(
        "--iSynFile",
        dest="syn_fname",
        help="Input file for the additional ontology of synonyms",
        metavar="FILE",
        default=None)
    parser.add_option(
        "--outputPath",
        dest="output_path",
        help="Output path to place output files",
        metavar="PATH")
    parser.add_option(
        "--outputFile",
        dest="out_fname",
        help="Output file name for the mapping process",
        metavar="FILE",
        default="")
    parser.add_option(
        "--minPerMatch",
        dest="min_score",
        help="Minimal string matching percentage")
    parser.add_option(
        "--minCRFProbs",
        dest="min_probs",
        help="Minimal crf probabilities")
    (options, args) = parser.parse_args()
    # No positional arguments are accepted; parser.error already exits, so
    # the sys.exit(1) below is unreachable (kept as-is).
    if len(args) > 0:
        parser.error("Any parameter given.")
        sys.exit(1)
    #################### DISP PARAMETERS ####################
    print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
    print("--inputPath Path of npl tagged file: " + str(options.input_path))
    print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
    print("--iOntoFile Input file with the ontology entities (MCO-terms): " + str(options.onto_fname))
    print("--iLinksFile Input file with links and id for the ontology (MCO-type-links): " + str(options.links_fname))
    print("--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): " + str(options.syn_fname))
    print("--outputPath Output path to place output files: " + str(options.output_path))
    print("--outputFile Output of the mapping process: " + str(options.out_fname))
    print("--minPerMatch Minimal string matching percentage: " + str(options.min_score))
    print("--minCRFProbs Minimal crf probabilities allowed: " + str(options.min_probs))
    print("\n\n")
    # Repository URL used as provenance for downloaded GEO files.
    repognrl = "http://pakal.ccg.unam.mx/cmendezc"
    reponame = "automatic-extraction-growth-conditions/tree/master/extraction-geo/download/srr_htregulondb"
    repo_url = '/'.join([repognrl,reponame])
    # Input files
    min_score = int(options.min_score)
    min_probs = float(options.min_probs)
    npl_ifile = os.path.join(options.input_path, options.npl_fname)
    mco_ifile = os.path.join(options.input_path, options.onto_fname)
    mco_syn_ifile = os.path.join(options.input_path, options.syn_fname)
    #Output files
    raw_ofname = "_".join(["raw", options.out_fname])
    rawmap_ofile = os.path.join(options.output_path, raw_ofname)
    str_ofname = "_".join(["sim", options.out_fname])
    strmap_ofile = os.path.join(options.output_path, str_ofname)
    full_ofile = os.path.join(options.output_path, "full_"+options.out_fname)
    full_unmap_ofile = os.path.join(options.output_path, "full_unmap_"+options.out_fname)
    # JSON outputs derive their names from the TSV output name.
    json_ofile = os.path.join(options.output_path, options.out_fname)
    json_ofile_map = json_ofile.replace(".tsv", "_map.json")
    json_ofile_unmap= json_ofile.replace(".tsv", "_unmap.json")
    json_ofile_list= json_ofile.replace(".tsv", "_list.json")
    json_ofile_df_list= json_ofile.replace(".tsv", "_df_list.json")
    #################### Load input data ####################
    # Load CRF-annotation and verify the expected column set is present.
    exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
    npl_full = read_table(npl_ifile, sep = "\t")
    obs_cols = set(npl_full.columns)
    if exp_cols.intersection(obs_cols) != exp_cols:
        ocol = ", ".join(list(exp_cols))
        sys.exit(ocol + " expected columns for iAnnotatedFile" )
    #Load MCO term names
    exp_cols = {"TERM_ID", "TERM_NAME"}
    mco_df_full = read_table(mco_ifile, sep = "\t")
    obs_cols = set(mco_df_full.columns)
    if exp_cols.intersection(obs_cols) != exp_cols:
        sys.exit("\"TERM_ID\" and \"TERM_NAME\" expected columns for iOntoFile" )
    mco_df = mco_df_full[["TERM_ID","TERM_NAME"]]
    mco_df = mco_df.drop_duplicates(keep="first")
    mco_df = mco_df.dropna()
    #Load MCO links (optional: only when --iLinksFile was given)
    if options.links_fname is not None:
        print("\nLoad types...")
        mcolink_ifile = os.path.join(options.input_path, options.links_fname)
        exp_cols = {"TERM_ID", "TERM_TYPE"}
        mco_links_full = read_table(mcolink_ifile, sep = "\t")
        obs_cols = set(mco_links_full.columns)
        if exp_cols.intersection(obs_cols) != exp_cols:
            sys.exit("at least \"TERM_ID\" and \"TERM_TYPE\" expected columns for iLinksFile" )
        mco_links = mco_links_full[["TERM_ID", "TERM_TYPE"]]
        mco_links = mco_links.drop_duplicates(keep="first")
        mco_links = mco_links.dropna()
    else:
        mco_links = None
    #Load MCO terms synonyms
    #format json from mco to dataframe
    mco_json = open(mco_syn_ifile )
    data = json.load(mco_json)
    mco_syn = format_fun.json2DataFrame(data)
    # Group one record per annotated row under its SRR run id.
    df_json = defaultdict(list)
    for idx,row in npl_full.iterrows():
        record = format_fun.created_record(row)
        df_json[row.SRR].append(record)
    # NOTE(review): bare expression below is a no-op (notebook leftover).
    df_json
    # NOTE(review): created_record expects a single row, but here it is
    # handed the whole df_json dict / each per-SRR list — looks suspect,
    # confirm intended behavior against format_fun.created_record.
    with open(json_ofile_list, "w") as output:
        json.dump(format_fun.created_record(df_json), output)
    with open(json_ofile_df_list, "a") as output:
        for idx,row in df_json.items():
            json.dump(format_fun.created_record(row), output)
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
#Setup
"""
#################### Setup ####################
# Formats ZIKA-database annotated rows (no MCO mapping) into a single JSON
# file keyed by SRR, each entry holding a "growth_conditions" record list.
from collections import defaultdict
from optparse import OptionParser
import os
from pandas import read_csv, DataFrame, merge, concat, read_table
from numpy import exp, nan
import json
import format_fun_v4 as format_fun
import sys
"""
# input parameters
--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv
--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/
--outputFile all_srr_IV_mapped.tsv
#Example
# python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/format_zika_v4.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile No_GSM_Metadata_Selected_v3.tsv --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ --outputFile zika_mapped.json
"""
#################### Defining parameters ####################
if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option(
        "--inputPath",
        dest="input_path",
        help="Path of npl tagged file (crf output)",
        metavar="PATH")
    parser.add_option(
        "--iAnnotatedFile",
        dest="npl_fname",
        help="Input file of npl tagged file (crf output)",
        metavar="FILE",
        default="")
    parser.add_option(
        "--outputPath",
        dest="output_path",
        help="Output path to place output files",
        metavar="PATH")
    parser.add_option(
        "--outputFile",
        dest="out_fname",
        help="Output file name for the mapping process",
        metavar="FILE",
        default="")
    (options, args) = parser.parse_args()
    # No positional args accepted; parser.error exits, sys.exit is unreachable.
    if len(args) > 0:
        parser.error("Any parameter given.")
        sys.exit(1)
    #################### DISP PARAMETERS ####################
    print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
    print("--inputPath Path of npl tagged file: " + str(options.input_path))
    print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
    print("--outputPath Output path to place output files: " + str(options.output_path))
    print("--outputFile Output of the mapping process: " + str(options.out_fname))
    print("\n\n")
    # Input files
    npl_ifile = os.path.join(options.input_path, options.npl_fname)
    #Output files
    ofname = os.path.join(options.output_path, options.out_fname)
    #################### Load input data ####################
    # Load CRF-annotation and verify the expected column set is present.
    exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
    npl_full = read_table(npl_ifile, sep = "\t")
    obs_cols = set(npl_full.columns)
    if exp_cols.intersection(obs_cols) != exp_cols:
        ocol = ", ".join(list(exp_cols))
        sys.exit(ocol + " expected columns for iAnnotatedFile" )
    # Build one record per row, grouped by SRR run id; ZIKA rows are never
    # mapped to the MCO (no_map = True).
    df_json = defaultdict(list)
    for idx,row in npl_full.iterrows():
        record = format_fun.created_record(row, source = "ZIKAdb", no_map = True, esource = "database")
        if(idx<2): print(record)  # spot-check the first records on stdout
        #record_json = json.dumps(record)
        record_json = record
        df_json[row.SRR].append(record_json)
    """
    with open(ofname, "a") as output:
        output.write("field:[")
        sep=""
        for k,v in df_json.items():
            output.write(sep)
            json.dump(v, output)
            sep=","
        output.write("]")
    """
    # Hand-assemble the top-level JSON object so each SRR key wraps its
    # records in a "growth_conditions" list.
    # NOTE(review): file is opened in append mode — rerunning the script
    # produces invalid JSON (concatenated objects); confirm intended.
    with open(ofname, "a") as output:
        output.write("{")
        sep=""
        for k,v in df_json.items():
            output.write(sep)
            output.write("\""+k+"\"")
            output.write(":")
            record_list = {
                "growth_conditions": df_json[k]
            }
            json.dump(record_list, output)
            sep=","
        output.write("}")
    # Round-trip parse as a sanity check that the output is valid JSON.
    df=open(ofname)
    df=json.load(df)
# -*- coding: utf-8 -*-
"""
#Setup
"""
#################### Setup ####################
# Formats ZIKA-database annotated rows into JSON via format_fun_v6.to_json;
# all rows are flagged MAP = False (no MCO mapping for ZIKA data).
from optparse import OptionParser
import os
from pandas import read_csv, DataFrame, merge, concat, read_table
from numpy import mean
import format_fun_v6 as format_fun
import sys
"""
# input parameters
--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv
--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/
--outputFile all_srr_IV_mapped.tsv
#Example
# python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/format_zika_v5.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile No_GSM_Metadata_Selected_v4.tsv --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/ --outputFile zika.json > automatic-extraction-growth-conditions/mapping_MCO/reports/zika_formated_report.out
# python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/format_zika_v5.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile No_GSM_Metadata_Selected_v4.tsv --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/test/ --outputFile zika_mapped.json > automatic-extraction-growth-conditions/mapping_MCO/test/zika_mapping_report.out
"""
#################### Defining parameters ####################
if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option(
        "--inputPath",
        dest="input_path",
        help="Path of npl tagged file (crf output)",
        metavar="PATH")
    parser.add_option(
        "--iAnnotatedFile",
        dest="npl_fname",
        help="Input file of npl tagged file (crf output)",
        metavar="FILE",
        default="")
    parser.add_option(
        "--outputPath",
        dest="output_path",
        help="Output path to place output files",
        metavar="PATH")
    parser.add_option(
        "--outputFile",
        dest="out_fname",
        help="Output file name for the mapping process",
        metavar="FILE",
        default="")
    (options, args) = parser.parse_args()
    # No positional args accepted; parser.error exits, sys.exit is unreachable.
    if len(args) > 0:
        parser.error("Any parameter given.")
        sys.exit(1)
    #################### DISP PARAMETERS ####################
    print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
    print("--inputPath Path of npl tagged file: " + str(options.input_path))
    print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
    print("--outputPath Output path to place output files: " + str(options.output_path))
    print("--outputFile Output of the mapping process: " + str(options.out_fname))
    print("\n\n")
    # Input files
    npl_ifile = os.path.join(options.input_path, options.npl_fname)
    #Output files
    ofname = os.path.join(options.output_path, options.out_fname)
    #################### Load input data ####################
    # Load CRF-annotation, drop exact duplicate rows, and verify columns.
    exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
    npl_full = read_table(npl_ifile, sep = "\t")
    npl_full = npl_full.drop_duplicates()
    print(f"Total zika terms: {len(npl_full)} ")
    obs_cols = set(npl_full.columns)
    if exp_cols.intersection(obs_cols) != exp_cols:
        ocol = ", ".join(list(exp_cols))
        sys.exit(ocol + " expected columns for iAnnotatedFile" )
    """
    df_terms = defaultdict(list)
    for idx,row in npl_full.iterrows():
        term_record = format_fun.get_term_info(row, source = "ZIKAdb", map=False)
        df_terms[row.SRR].append(term_record)
    df_json = {}
    df_tmp = npl_full.drop_duplicates("SRR", keep="first")
    for idx,row in df_tmp.iterrows():
        srr_record = format_fun.created_record(
            info_df = row,
            term_list = df_terms[row.SRR],
            source = "ZIKAdb",
            esource = "database")
        df_json[row.SRR] = srr_record
    with open(ofname, "w") as output:
        json.dump(df_json, output, separators=(',', ':'), indent=4)
    df=open(ofname)
    df=json.load(df)
    print(df["ERR1399578"])
    """
    # ZIKA rows carry no MCO mapping; mark every row unmapped and delegate
    # grouping + serialization to format_fun.to_json.
    npl_full["MAP"] = False
    format_fun.to_json(
        df = npl_full,
        source_info = "ZIKAdb",
        evidence_source = "database",
        ofname = ofname)
\ No newline at end of file
......@@ -23,7 +23,8 @@ import json
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import format_fun
#import format_fun
import format_fun_v4 as format_fun
import mapping_fun
import sys
......@@ -338,5 +339,6 @@ if __name__ == "__main__":
with open(json_ofile_unmap, "a") as output:
for idx,row in full_unmap.iterrows():
json.dump(format_fun.created_record(row), output)
\ No newline at end of file
......
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
from pandas import read_csv, merge

# Keep only CRF-predicted growth-condition rows whose (TERM_TYPE, TERM_NAME)
# pair appears in the manually-reviewed whitelist, then save them as TSV.
crf_output_file = "/home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/srr_htregulondb/srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv"
annot_file = "/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/correct_gc_terms_07_rev_Victor.csv"
filter_ofile = "/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/srr_htregulondb_correct_gc_terms_07_rev_Victor.tsv"

# Column layout shared by the CRF output and the filtered result.
crf_columns = ["SRR","GSE","GSM","GPL","PMID","GSM_NAME",
               "GSE_NAME","GPL_NAME","BANGLINE",
               "SOURCE_TEXT_CTRL","FULL_TEXT","TERM_NAME",
               "TERM_TYPE","PROB"]

# Reviewed whitelist of accepted (type, name) pairs.
whitelist = read_csv(annot_file, names = ["TERM_TYPE", "TERM_NAME"] )
whitelist.TERM_NAME = [text.strip() for text in whitelist.TERM_NAME]

# CRF predictions; term names are stripped so the merge keys line up.
predictions = read_csv(crf_output_file, names = crf_columns, sep = "\t")
predictions.TERM_NAME = [text.strip() for text in predictions.TERM_NAME]

# Inner join keeps only whitelisted rows; reindex restores the CRF layout.
filtered = merge( whitelist, predictions, on = ["TERM_TYPE", "TERM_NAME"] )
filtered = filtered.reindex(columns = crf_columns)
filtered.to_csv(filter_ofile,sep="\t",index=False,header=True)
This diff could not be displayed because it is too large.
GC_type,GC_term
Agit,250 rpm
Air,Aerobic
Air,Aerobic and anaerobic
Air,aerobically
Air,anaerobic
Gtype,{ delta } baeR
Gtype,{ delta } cpxR
Gtype,{ delta } cspABCEG
Gtype,{ delta } cspABEG
Gtype,{ delta } cspBG
Gtype,{ delta } hns
Gtype,{ delta } kdpE
Gtype,{ delta } nusG
Gtype,{ delta } perC : : kanR
Gtype,{ delta } phoB
Gtype,{ delta } rho
Gtype,{ delta } rnr
Gtype,{ delta } zraR
Gtype,: φO104
Gtype,DH5α ( pAR060302 )
Gtype,E.coli K12 BW25113
Gtype,K12 MG1655
Gtype,K12 MG1655 deltaprfC
Gtype,K12 MG1655 prfB-Bstrain allele
Gtype,K12 MG1655 prfB-Bstrain allele deltaprfC
Gtype,K12 MG1657
Gtype,K12 MG1667
Gtype,K12 MG1668
Gtype,K12 MG1672
Gtype,K12 MG1673
Gtype,K12 MG1674
Gtype,K12 W3110
Gtype,MC4100 ∆ tig : : kan pTig-TEV-Avi
Gtype,O157 : H7 NCTC 12900
Gtype,PNPase mutant
Gtype,Pck over-expressed
Gtype,RNase II mutant
Gtype,RNase R mutant
Gtype,W3110 6xHis-rpoD
Gtype,W3110 6xHis-rpoD greA : : tet greB : : amp
Gtype,"W3110 rpoC-6xHis : : kan greA : : tet , greB : : amp"
Gtype,WT
Gtype,WT WT
Gtype,Wild type
Gtype,Wild-type
Gtype,Wildtype
Gtype,cra KO ; BW25113 Dcra
Gtype,cya mutant background
Gtype,delta Crp
Gtype,delta _ cra
Gtype,delta-gadE
Gtype,delta-gadW
Gtype,delta-gadX
Gtype,delta-oxyR
Gtype,delta-soxR
Gtype,delta-soxS
Gtype,fepA knockout
Gtype,fis mutant background
Gtype,lacA knockout
Gtype,lack Fis protein
Gtype,lack H-NS protein
Gtype,naive ( wild type )
Gtype,ompR deletion mutant
Gtype,phageO104 in the wrbA gene
Gtype,phagePA8 in the argW gene
Gtype,rng mutant
Gtype,sdhC knockout
Gtype,sigma70 WT
Gtype,wild type
Gtype,wild type ; MG1655
Gtype,wild-type
Gtype,wildtype
Gtype,wt
Gtype,yafC deletion
Gtype,ybaO deletion
Gtype,ybaQ deletion
Gtype,ybiH deletion
Gtype,ydcI deletion
Gtype,yddM deletion
Gtype,yeiE deletion
Gtype,yheO deletion
Gtype,yiaJ deletion
Gtype,yieP deletion
Gtype,Δcra
Gtype,Δfur
Gtype,ΔgadE
Gtype,ΔgadW
Gtype,ΔgadX
Gtype,ΔoxyR
Gtype,ΔsoxR
Gtype,ΔsoxS
Gtype,∆ cspABCEG
Gtype,∆ cspABEG
Gtype,∆ cspBG
Gtype,∆ hfq : : cat )
Gtype,∆ rnr
Med,Bertani ( LB ) medium
Med,Davis Minimal medium
Med,LB
Med,LB media
Med,LB medium
Med,"LB medium ,"
Med,M9 + 4 g/L glc ( glucose minimal media )
Med,M9 minimal media
Med,M9 minimal medium
Med,MOPS complete-glucose liquid media
Med,MOPS glucose minimal medium
Med,MOPS medium
Med,Neidhardt MOPS Minimal Medium ( NM3 )
Med,SB medium
Med,SILAC
Med,W2 minimal media
Med,fresh DM500
Med,fully supplemented MOPS glucose media
Med,glucose-M9 minimal media
Med,glucose-limited minimal medium
Med,in fresh LB medium
Med,minimal medium
OD,O.D. 600nm 0.5
OD,OD600 = 0.3
OD,OD600 of about 0.8
Phase,IspG1 strain
Phase,exponential
Phase,log phase
Phase,log phase sample
Phase,mid-log phase
Phase,stationary
Phase,stationary phase
Supp,0.1 mM KCl
Supp,0.2 % arabinose
Supp,0.2 % glucose
Supp,0.2 % glutamine
Supp,0.2 mM of DPD
Supp,0.3 % glucose
Supp,0.3 M of NaCl
Supp,0.4 % glucose
Supp,0.5 % glucose
Supp,100 μM IPTG
Supp,1mM IPTG
Supp,2 mM Hydrogen peroxide
Supp,22 mM glucose
Supp,250 uM of paraquat
Supp,2g/L glucose
Supp,2g/L glucose and 1 mM cytidine
Supp,4g/L glucose
Supp,50 µM NiCl2
Supp,70 µM IPTG
Supp,DPD
Supp,Fe
Supp,IPTG
Supp,IPTG was
Supp,L-trp
Supp,Xgal and IPTG
Supp,acetate
Supp,ade
Supp,arabinose
Supp,fructose
Supp,glucose
Supp,glutamine
Supp,induced 50 µM IPTG
Supp,mM IPTG
Supp,mM IPTG + 50μg/ml Amp
Supp,rhamnose
Supp,rifampicin
Supp,rifampicin and
Supp,rifampicin time point
Supp,rifampicin time point 0
Supp,rifampicin time point 4
Supp,rifampicin time point 6
Supp,rifampicin time point 8
Temp,10 °C
Temp,30 °C
Temp,37 °C
Temp,37 ℃
Temp,42 °C
pH,pH 5.5
pH,pH5 .5
# -*- coding: utf-8 -*-
"""
#Setup
"""
#################### Setup ####################
from collections import defaultdict
from optparse import OptionParser
import os
from numpy.core.fromnumeric import sort
from pandas import read_csv, DataFrame, merge, concat, read_table
from numpy import exp, nan
import seaborn as sns
from numpy import mean
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
# %matplotlib inline
from collections import Counter
import json
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import format_fun
import mapping_fun
import sys
"""
# input parameters
--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv
--iOntoFile gc_ontology_terms_v2.txt
--iLinksFile gc_ontology_terms_link_v2.txt
--iSynFile mco_terms_v0.2.json
--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/
--outputFile all_srr_IV_mapped.tsv
--minPerMatch 90
#Example
# nohup python3 mapping2MCO_v3.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv --iOntoFile gc_ontology_terms_v2.txt --iSynFile mco_terms_v0.2.json --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ --outputFile srr_htregulondb_mapped.tsv --minPerMatch 80 --minCRFProbs 0.9 > ../reports/srr_htregulondb_mapping_report.out &
"""
#################### Defining parameters ####################
if __name__ == "__main__":
parser = OptionParser()
parser.add_option(
"--inputPath",
dest="input_path",
help="Path of npl tagged file (crf output)",
metavar="PATH")
parser.add_option(
"--iAnnotatedFile",
dest="npl_fname",
help="Input file of npl tagged file (crf output)",
metavar="FILE",
default="")
parser.add_option(
"--iOntoFile",
dest="onto_fname",
help="Input file with the ontology entities",
metavar="FILE",
default="")
parser.add_option(
"--iLinksFile",
dest="links_fname",
help="Input file with links and id for the ontology",
metavar="FILE",
default=None)
parser.add_option(
"--iSynFile",
dest="syn_fname",
help="Input file for the additional ontology of synonyms",
metavar="FILE",
default=None)
parser.add_option(
"--outputPath",
dest="output_path",
help="Output path to place output files",
metavar="PATH")
parser.add_option(
"--outputFile",
dest="out_fname",
help="Output file name for the mapping process",
metavar="FILE",
default="")
parser.add_option(
"--minPerMatch",
dest="min_score",
help="Minimal string matching percentage")
parser.add_option(
"--minCRFProbs",
dest="min_probs",
help="Minimal crf probabilities")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("Any parameter given.")
sys.exit(1)
#################### DISP PARAMETERS ####################
print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
print("--inputPath Path of npl tagged file: " + str(options.input_path))
print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
print("--iOntoFile Input file with the ontology entities (MCO-terms): " + str(options.onto_fname))
print("--iLinksFile Input file with links and id for the ontology (MCO-type-links): " + str(options.links_fname))
print("--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): " + str(options.syn_fname))
print("--outputPath Output path to place output files: " + str(options.output_path))
print("--outputFile Output of the mapping process: " + str(options.out_fname))
print("--minPerMatch Minimal string matching percentage: " + str(options.min_score))
print("--minCRFProbs Minimal crf probabilities allowed: " + str(options.min_probs))
print("\n\n")
repognrl = "http://pakal.ccg.unam.mx/cmendezc"
reponame = "automatic-extraction-growth-conditions/tree/master/extraction-geo/download/srr_htregulondb"
repo_url = '/'.join([repognrl,reponame])
# Numeric thresholds arrive as strings from optparse; convert them once.
min_score = int(options.min_score)
min_probs = float(options.min_probs)
# --- Input files -------------------------------------------------------
npl_ifile = os.path.join(options.input_path, options.npl_fname)
mco_ifile = os.path.join(options.input_path, options.onto_fname)
mco_syn_ifile = os.path.join(options.input_path, options.syn_fname)
# --- Output files ------------------------------------------------------
# Each mapping stage writes its own prefixed copy of the output name:
# raw_* (exact-string matches), sim_* (similarity matches), full_* and
# full_unmap_* (merged mapped/unmapped tables), plus JSON variants.
raw_ofname = "raw_" + options.out_fname
rawmap_ofile = os.path.join(options.output_path, raw_ofname)
str_ofname = "sim_" + options.out_fname
strmap_ofile = os.path.join(options.output_path, str_ofname)
full_ofile = os.path.join(options.output_path, "_".join(["full", options.out_fname]))
full_unmap_ofile = os.path.join(options.output_path, "_".join(["full_unmap", options.out_fname]))
json_ofile = os.path.join(options.output_path, options.out_fname)
json_ofile_map = json_ofile.replace(".tsv", "_map.json")
json_ofile_unmap = json_ofile.replace(".tsv", "_unmap.json")
#################### Load input data ####################
# Load the CRF-annotated terms; abort unless every required column is
# present in the tagged file.
exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
npl_full = read_table(npl_ifile, sep="\t")
obs_cols = set(npl_full.columns)
if not exp_cols.issubset(obs_cols):
    ocol = ", ".join(list(exp_cols))
    sys.exit(ocol + " expected columns for iAnnotatedFile" )
# Load the MCO term catalogue and keep a deduplicated, NaN-free
# (TERM_ID, TERM_NAME) lookup table.
exp_cols = {"TERM_ID", "TERM_NAME"}
mco_df_full = read_table(mco_ifile, sep="\t")
obs_cols = set(mco_df_full.columns)
if not exp_cols.issubset(obs_cols):
    sys.exit("\"TERM_ID\" and \"TERM_NAME\" expected columns for iOntoFile" )
mco_df = (
    mco_df_full[["TERM_ID", "TERM_NAME"]]
    .drop_duplicates(keep="first")
    .dropna()
)
# Load the optional MCO type-links table (TERM_ID -> TERM_TYPE).
# When --iLinksFile is not given, mco_links stays None and downstream
# code skips type information.
if options.links_fname is None:
    mco_links = None
else:
    print("\nLoad types...")
    mcolink_ifile = os.path.join(options.input_path, options.links_fname)
    exp_cols = {"TERM_ID", "TERM_TYPE"}
    mco_links_full = read_table(mcolink_ifile, sep="\t")
    obs_cols = set(mco_links_full.columns)
    if not exp_cols.issubset(obs_cols):
        sys.exit("at least \"TERM_ID\" and \"TERM_TYPE\" expected columns for iLinksFile" )
    mco_links = (
        mco_links_full[["TERM_ID", "TERM_TYPE"]]
        .drop_duplicates(keep="first")
        .dropna()
    )
#Load MCO terms synonyms
# Parse the MCO synonyms JSON export and convert it to a DataFrame with
# the project helper. A context manager guarantees the file handle is
# closed even if json.load() raises -- the original `open()` without
# `close()` leaked the handle for the life of the process.
with open(mco_syn_ifile) as mco_json:
    data = json.load(mco_json)
mco_syn = format_fun.json2DataFrame(data)
# Group the unmapped records by SRR run accession and dump them as JSON.
# NOTE(review): the original lines were syntactically broken -- a stray
# "), output)" after created_record(row), a no-op bare `df_json`
# statement, and two undefined file names (json_ofile_list /
# json_ofile_df_list). Since this loop iterates full_unmap, the
# unmapped-terms JSON path defined above (json_ofile_unmap) is
# presumably the intended target -- TODO confirm with the author.
df_json = defaultdict(list)
for _, row in full_unmap.iterrows():
    record = format_fun.created_record(row)
    df_json[row.SRR].append(record)
with open(json_ofile_unmap, "w") as output:
    json.dump(df_json, output)
\ No newline at end of file
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v5.py:312: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
str_matches_odf["SOURCE"] = mco_ifile
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv
--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt
--iLinksFile Input file with links and id for the ontology (MCO-type-links): None
--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/
--outputFile Output of the mapping process: srr_htregulondb.tsv
--minPerMatch Minimal string matching percentage: 80
--minCRFProbs Minimal crf probabilities allowed: 0.9
-------------------------------- INPUTS --------------------------------
npl tagged file
SRR ... REPO_FILE
0 SRR5742248 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
5 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
7 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
[3 rows x 15 columns]
ontology entities
TERM_ID TERM_NAME
0 MCO000000014 generically dependent continuant
1 MCO000000015 radiation
2 MCO000000016 electromagnetic radiation
additional ontology of synonyms (MCO-syn-json)
ENTITY_NAME TERM_ID TERM_NAME
MCO000000019 continuant MCO000000019
MCO000002475 culture medium MCO000002475
MCO000002467_0 Organism MCO000002467 biologicentity
-------------------------------- RESULTS --------------------------------
Tracking exact terms to MCO...
Mapping 4099 terms to MCO based on exact strings...
Mapping 3770 terms to MCO - synonyms based on exact strings...
Total of terms mapped by exact strings: 387
Saving filtered terms from raw mapping...
3712 unmapped terms based on exact strings
Dropping duplicated unmapped term names...
206 unmapped unique terms based on exact strings
compute string similarity...
Mapping to MCO 206 terms based on string similarity...
Mapping to MCO - synonyms 152 terms based on string similarity...
Unique terms mapped by string similarity: 73
Total of terms mapped by string similarity: 1992
Saving filtered terms from str mapping...
--------------------END----------------------
Total of terms mapped: 2379
Total of terms unmapped: 1720
/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v5.py:312: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
str_matches_odf["SOURCE"] = mco_ifile
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv
--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt
--iLinksFile Input file with links and id for the ontology (MCO-type-links): None
--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/
--outputFile Output of the mapping process: srr_htregulondb.tsv
--minPerMatch Minimal string matching percentage: 80
--minCRFProbs Minimal crf probabilities allowed: 0.9
-------------------------------- INPUTS --------------------------------
npl tagged file
SRR ... REPO_FILE
0 SRR5742248 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
5 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
7 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
[3 rows x 15 columns]
ontology entities
TERM_ID TERM_NAME
0 MCO000000014 generically dependent continuant
1 MCO000000015 radiation
2 MCO000000016 electromagnetic radiation
additional ontology of synonyms (MCO-syn-json)
ENTITY_NAME TERM_ID TERM_NAME
MCO000000019 continuant MCO000000019
MCO000002475 culture medium MCO000002475
MCO000002467_0 Organism MCO000002467 biologicentity
-------------------------------- RESULTS --------------------------------
Tracking exact terms to MCO...
Mapping 4099 terms to MCO based on exact strings...
Mapping 3770 terms to MCO - synonyms based on exact strings...
Total of terms mapped by exact strings: 387
Saving filtered terms from raw mapping...
3712 unmapped terms based on exact strings
Dropping duplicated unmapped term names...
206 unmapped unique terms based on exact strings
compute string similarity...
Mapping to MCO 206 terms based on string similarity...
Mapping to MCO - synonyms 152 terms based on string similarity...
Unique terms mapped by string similarity: 73
Total of terms mapped by string similarity: 1992
Saving filtered terms from str mapping...
--------------------END----------------------
Total of terms mapped: 2379
Total of terms unmapped: 1720
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v6.py:313: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
str_matches_odf["SOURCE"] = mco_ifile
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_correct_gc_terms_07_rev_Victor.tsv
--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt
--iLinksFile Input file with links and id for the ontology (MCO-type-links): None
--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v4/curated/
--outputFile Output of the mapping process: srr_htregulondb_correct_gc_terms_07_rev_Victor_mapped.tsv
--minPerMatch Minimal string matching percentage: 80
--minCRFProbs Minimal crf probabilities allowed: 0.9
-------------------------------- INPUTS --------------------------------
npl tagged file
SRR ... REPO_FILE
0 SRR771533 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
2 SRR771534 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
24 SRR3194453 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
[3 rows x 14 columns]
ontology entities
TERM_ID TERM_NAME
0 MCO000000014 generically dependent continuant
1 MCO000000015 radiation
2 MCO000000016 electromagnetic radiation
additional ontology of synonyms (MCO-syn-json)
ENTITY_NAME TERM_ID TERM_NAME
MCO000000019 continuant MCO000000019
MCO000002475 culture medium MCO000002475
MCO000002467_0 Organism MCO000002467 biologicentity
-------------------------------- RESULTS --------------------------------
Tracking exact terms to MCO...
Mapping 2149 terms to MCO based on exact strings...
Mapping 1820 terms to MCO - synonyms based on exact strings...
Total of terms mapped by exact strings: 387
Saving filtered terms from raw mapping...
1762 unmapped terms based on exact strings
Dropping duplicated unmapped term names...
104 unmapped unique terms based on exact strings
compute string similarity...
Mapping to MCO 104 terms based on string similarity...
Mapping to MCO - synonyms 61 terms based on string similarity...
Unique terms mapped by string similarity: 58
Total of terms mapped by string similarity: 1570
Saving filtered terms from str mapping...
--------------------END----------------------
Total of terms mapped: 1957
Total of terms unmapped: 192
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v5.py:312: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
str_matches_odf["SOURCE"] = mco_ifile
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv
--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt
--iLinksFile Input file with links and id for the ontology (MCO-type-links): None
--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v3/
--outputFile Output of the mapping process: srr_htregulondb.tsv
--minPerMatch Minimal string matching percentage: 80
--minCRFProbs Minimal crf probabilities allowed: 0.9
-------------------------------- INPUTS --------------------------------
npl tagged file
SRR ... REPO_FILE
0 SRR5742248 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
5 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
7 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
[3 rows x 15 columns]
ontology entities
TERM_ID TERM_NAME
0 MCO000000014 generically dependent continuant
1 MCO000000015 radiation
2 MCO000000016 electromagnetic radiation
additional ontology of synonyms (MCO-syn-json)
ENTITY_NAME TERM_ID TERM_NAME
MCO000000019 continuant MCO000000019
MCO000002475 culture medium MCO000002475
MCO000002467_0 Organism MCO000002467 biologicentity
-------------------------------- RESULTS --------------------------------
Tracking exact terms to MCO...
Mapping 4099 terms to MCO based on exact strings...
Mapping 3770 terms to MCO - synonyms based on exact strings...
Total of terms mapped by exact strings: 387
Saving filtered terms from raw mapping...
3712 unmapped terms based on exact strings
Dropping duplicated unmapped term names...
206 unmapped unique terms based on exact strings
compute string similarity...
Mapping to MCO 206 terms based on string similarity...
Mapping to MCO - synonyms 152 terms based on string similarity...
Unique terms mapped by string similarity: 73
Total of terms mapped by string similarity: 1992
Saving filtered terms from str mapping...
--------------------END----------------------
Total of terms mapped: 2379
Total of terms unmapped: 1720
/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v6.py:313: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
str_matches_odf["SOURCE"] = mco_ifile
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv
--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt
--iLinksFile Input file with links and id for the ontology (MCO-type-links): None
--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v4/
--outputFile Output of the mapping process: srr_htregulondb.tsv
--minPerMatch Minimal string matching percentage: 80
--minCRFProbs Minimal crf probabilities allowed: 0.9
-------------------------------- INPUTS --------------------------------
npl tagged file
SRR ... REPO_FILE
0 SRR5742248 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
5 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
7 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
[3 rows x 14 columns]
ontology entities
TERM_ID TERM_NAME
0 MCO000000014 generically dependent continuant
1 MCO000000015 radiation
2 MCO000000016 electromagnetic radiation
additional ontology of synonyms (MCO-syn-json)
ENTITY_NAME TERM_ID TERM_NAME
MCO000000019 continuant MCO000000019
MCO000002475 culture medium MCO000002475
MCO000002467_0 Organism MCO000002467 biologicentity
-------------------------------- RESULTS --------------------------------
Tracking exact terms to MCO...
Mapping 3769 terms to MCO based on exact strings...
Mapping 3440 terms to MCO - synonyms based on exact strings...
Total of terms mapped by exact strings: 387
Saving filtered terms from raw mapping...
3382 unmapped terms based on exact strings
Dropping duplicated unmapped term names...
206 unmapped unique terms based on exact strings
compute string similarity...
Mapping to MCO 206 terms based on string similarity...
Mapping to MCO - synonyms 152 terms based on string similarity...
Unique terms mapped by string similarity: 73
Total of terms mapped by string similarity: 1668
Saving filtered terms from str mapping...
--------------------END----------------------
Total of terms mapped: 2055
Total of terms unmapped: 1714
This diff could not be displayed because it is too large.
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile Input file of npl tagged file: No_GSM_Metadata_Selected_v4.tsv
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/
--outputFile Output of the mapping process: zika.json
Total zika terms: 2351