Showing
56 changed files
with
2812 additions
and
3 deletions
... | @@ -57,6 +57,12 @@ if (!length(opt)){ | ... | @@ -57,6 +57,12 @@ if (!length(opt)){ |
57 | ## Input files and output directories | 57 | ## Input files and output directories |
58 | infoFile <- opt$infoFile | 58 | infoFile <- opt$infoFile |
59 | 59 | ||
60 | +if (!"gse" %in% names(gseInfo)){ | ||
61 | + stop("include at least gse column") | ||
62 | +} | ||
63 | +if (!"gsm" %in% names(gseInfo)){ | ||
64 | +    gseInfo$gsm <- "GSM" | ||
65 | +} | ||
60 | 66 | ||
61 | ## Load main variables | 67 | ## Load main variables |
62 | 68 | ||
... | @@ -89,4 +95,4 @@ for (geoid in unique(gseInfo$gse)) { | ... | @@ -89,4 +95,4 @@ for (geoid in unique(gseInfo$gse)) { |
89 | } | 95 | } |
90 | cat("download id: ", length(list.dirs(opt$downloadPath, recursive = FALSE))) | 96 | cat("download id: ", length(list.dirs(opt$downloadPath, recursive = FALSE))) |
91 | 97 | ||
92 | -message("Required GSE: ", ngse_down) | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
98 | +message("Required GSE: ", ngse_down) | ... | ... |
No preview for this file type
No preview for this file type
No preview for this file type
... | @@ -42,6 +42,11 @@ def get_crossref_info(info_df): | ... | @@ -42,6 +42,11 @@ def get_crossref_info(info_df): |
42 | - **pmid**: PubMed ID | 42 | - **pmid**: PubMed ID |
43 | """ | 43 | """ |
44 | def get_cite_info(info_df): | 44 | def get_cite_info(info_df): |
45 | + if(info_df.CASE_MATCH == "ZIKA"): | ||
46 | + cite_dict ={ | ||
47 | + "evidence_id":"", | ||
48 | + "evidence_name":"ZIKA", | ||
49 | + "pmid" : info_df.PMID} | ||
45 | cite_dict ={ | 50 | cite_dict ={ |
46 | "evidence_id": "", | 51 | "evidence_id": "", |
47 | "evidence_name" : "NPL-CRF", #NPL | 52 | "evidence_name" : "NPL-CRF", #NPL |
... | @@ -49,6 +54,8 @@ def get_cite_info(info_df): | ... | @@ -49,6 +54,8 @@ def get_cite_info(info_df): |
49 | } | 54 | } |
50 | return(cite_dict) | 55 | return(cite_dict) |
51 | def get_description(info_df): | 56 | def get_description(info_df): |
57 | + if(info_df.CASE_MATCH=="ZIKA"): | ||
58 | + mco_mapping = {} | ||
52 | if info_df.CASE_MATCH == "MCO" and info_df.SET == 100 and info_df.SORT == 100: | 59 | if info_df.CASE_MATCH == "MCO" and info_df.SET == 100 and info_df.SORT == 100: |
53 | mco_mapping = { | 60 | mco_mapping = { |
54 | "type": "term present on MCO" | 61 | "type": "term present on MCO" | ... | ... |
mapping_MCO/bin/format_fun_v4.py
0 → 100755
1 | +from numpy import nan | ||
2 | +from collections import OrderedDict | ||
3 | +from pandas import DataFrame as DF | ||
4 | +""" | ||
5 | + - **name**: nombre del termino registrado en la MCO | ||
6 | + - **term_id**: identificador del termino en RegulonDB (si existe) | ||
7 | + - **term_type**: tipo de termino, uno de los siguientes: "Organism", "Genetic background", "Medium", "Medium supplement", "Aeration", "Temperature", "pH", "Pressure", "Optical Density (OD)", "Growth phase", "Growth rate", "Vessel type", "Agitation speed" | ||
8 | + - **source_data**: subdocumento que contiene informacion de GEO de donde se extrajo la informacion de la GC | ||
9 | + - **source**: fuente de los datos [ GEO, ] | ||
10 | + - **id**: identificador del registro de la base de datos o fuente de datos | ||
11 | + - **field**: campo de donde se esta tomando la informacion de la GC [ metadata field] | ||
12 | + - **associatedPhrase**: Frase de donde se tomo la informacion | ||
13 | +""" | ||
14 | +def get_term_info(info_df, source): | ||
15 | + term_dict = { | ||
16 | + "name": info_df.TERM_NAME, #NPL output | ||
17 | + "term_id" : info_df.TERM_ID, #MCO | ||
18 | + "term_type": info_df.TERM_TYPE, #NPL | ||
19 | + "source_data": info_df.REPO_FILE, #NPL | ||
20 | + "source": source, | ||
21 | + "id": info_df.GSM, #NPL | ||
22 | + "field": info_df.BANGLINE, #NPL | ||
23 | + "associatedPhrase": info_df.FULL_TEXT #NPL | ||
24 | + } | ||
25 | + return(term_dict) | ||
26 | + | ||
27 | + | ||
28 | +""" | ||
29 | + - **objectId**: Identificador en la base de datos fuente | ||
30 | + - **externalCrossReferences_name**: nombre de la DB [ GEO ] | ||
31 | +""" | ||
32 | +def get_crossref_info(info_df, source): | ||
33 | + crossref_dict ={ | ||
34 | + "objectId": info_df.GSM, #NPL | ||
35 | + "externalCrossReferences_name" : source | ||
36 | + } | ||
37 | + return(crossref_dict) | ||
38 | + | ||
39 | +""" | ||
40 | + - **evidence_id**: Identificador de RegulondB asociado a la evidencia | ||
41 | + - **evidence_name**: nombre de la evidencia, si es que no cuenta con el identificador | ||
42 | + - **pmid**: PubMed ID | ||
43 | +""" | ||
44 | +def get_cite_info(info_df, esource): | ||
45 | + cite_dict ={ | ||
46 | + "evidence_id": "", | ||
47 | + "evidence_name" : esource, | ||
48 | + "pmid": info_df.PMID | ||
49 | + } | ||
50 | + return(cite_dict) | ||
51 | + | ||
52 | +def get_description(info_df, no_map=False): | ||
53 | + if(no_map): | ||
54 | + mco_mapping = { | ||
55 | + "type": "not present on MCO" | ||
56 | + } | ||
57 | + elif info_df.CASE_MATCH == "MCO" and info_df.SET == 100 and info_df.SORT == 100: | ||
58 | + mco_mapping = { | ||
59 | + "type": "term present on MCO" | ||
60 | + } | ||
61 | + else: | ||
62 | + mco_mapping = { | ||
63 | + "type": "string similarity", | ||
64 | + "score": info_df.SET | ||
65 | + } | ||
66 | + return(mco_mapping) | ||
67 | +""" | ||
68 | +#run it in the main for each field | ||
69 | + | ||
70 | + | ||
71 | +return: type | ||
72 | + | ||
73 | +id: string | ||
74 | +name: string | ||
75 | +description: string | ||
76 | +terms: list of dict | ||
77 | +externalCrossReferences: list of dict | ||
78 | +citations: list of dict | ||
79 | + | ||
80 | +""" | ||
81 | + | ||
82 | +def created_record(term_info_df, source = "GEO", no_map = False, esource = "NPL-CRF"): | ||
83 | + record_dict = OrderedDict() | ||
84 | + term_info_df = term_info_df.replace(nan, '', regex=True) | ||
85 | + record_dict["id"] = term_info_df.TERM_ID #it should be add if it have been mapped | ||
86 | + record_dict["name"] = term_info_df.TERM_NAME #a colum form NPL output | ||
87 | + record_dict["description"] = [get_description(term_info_df, no_map=no_map)] | ||
88 | + record_dict["terms"] = [get_term_info(term_info_df, source)] | ||
89 | + record_dict["externalCrossReferences"] = [get_crossref_info(term_info_df, source)] | ||
90 | + record_dict["citations"] = [get_cite_info(term_info_df, esource)] | ||
91 | + | ||
92 | + return(record_dict) | ||
93 | + | ||
94 | +def json2DataFrame(data): | ||
95 | + mco_syn_dic = dict() | ||
96 | + | ||
97 | + for j,i in enumerate(data): | ||
98 | + if "regulondb_id" in i.keys(): | ||
99 | + | ||
100 | + if "synonyms" in i.keys(): | ||
101 | + | ||
102 | + for k,syn in enumerate(i['synonyms']): | ||
103 | + dict_key = i['regulondb_id']+"_"+str(k) | ||
104 | + mco_syn_dic[dict_key] = { | ||
105 | + #ENTITY_NAME | ||
106 | + 'ENTITY_NAME' : i['name'], | ||
107 | + #ENTITY_SYN | ||
108 | + 'TERM_NAME': syn.lower(), | ||
109 | + #regulondb_id | ||
110 | + 'TERM_ID' : i['regulondb_id'] } | ||
111 | + | ||
112 | + elif "hasRelatedSynonyms" in i.keys(): | ||
113 | + | ||
114 | + for k,syn in enumerate(i['hasRelatedSynonyms']): | ||
115 | + dict_key = i['regulondb_id']+"_"+str(k) | ||
116 | + mco_syn_dic[dict_key] = { | ||
117 | + #ENTITY_NAME | ||
118 | + 'ENTITY_NAME' : i['name'], | ||
119 | + #ENTITY_SYN | ||
120 | + 'TERM_NAME': syn.lower(), | ||
121 | + #regulondb_id | ||
122 | + 'TERM_ID' : i['regulondb_id'] } | ||
123 | + else: | ||
124 | + dict_key = i['regulondb_id'] | ||
125 | + mco_syn_dic[dict_key] = { | ||
126 | + #ENTITY_NAME | ||
127 | + 'ENTITY_NAME' : i['name'], | ||
128 | + #ENTITY_SYN | ||
129 | + 'TERM_NAME': '', | ||
130 | + #regulondb_id | ||
131 | + 'TERM_ID' : i['regulondb_id'] } | ||
132 | + | ||
133 | + mco_syn_df = DF.from_dict(mco_syn_dic).T | ||
134 | + | ||
135 | + | ||
136 | + return(mco_syn_df) | ||
137 | + |
mapping_MCO/bin/format_fun_v6.py
0 → 100755
1 | +from numpy import nan | ||
2 | +#from collections import OrderedDict | ||
3 | +from pandas import DataFrame as DF | ||
4 | +import json | ||
5 | +from collections import defaultdict | ||
6 | +import format_fun_v6 as format_fun | ||
7 | + | ||
8 | +def to_json(df, source_info, evidence_source, ofname): | ||
9 | + df_terms = defaultdict(list) | ||
10 | + | ||
11 | + for idx,row in df.iterrows(): | ||
12 | + term_record = format_fun.get_term_info( | ||
13 | + row, | ||
14 | + source = source_info, | ||
15 | + map= row.MAP) | ||
16 | + df_terms[row.SRR].append(term_record) | ||
17 | + | ||
18 | + df_json = {} | ||
19 | + df_tmp = df.drop_duplicates("SRR", keep="first") | ||
20 | + for idx,row in df_tmp.iterrows(): | ||
21 | + srr_record = format_fun.created_record( | ||
22 | + info_df = row, | ||
23 | + term_list = df_terms[row.SRR], | ||
24 | + source = source_info, | ||
25 | + esource = evidence_source) | ||
26 | + df_json[row.SRR] = srr_record | ||
27 | + | ||
28 | + with open(ofname, "w") as output: | ||
29 | + json.dump(df_json, output, separators=(',', ':'), indent=4) | ||
30 | + | ||
31 | +def get_score(info_df): | ||
32 | + if info_df.CASE_MATCH == "MCO" and info_df.SET == 100 and info_df.SORT == 100: | ||
33 | + subtext = "term present on MCO" | ||
34 | + else: | ||
35 | + mco_mapping = { | ||
36 | + "type": "string similarity", | ||
37 | + "score": info_df.SET | ||
38 | + } | ||
39 | + return(mco_mapping) | ||
40 | +""" | ||
41 | + - **name**: nombre del termino registrado en la MCO | ||
42 | + - **term_id**: identificador del termino en RegulonDB (si existe) | ||
43 | + - **term_type**: tipo de termino, uno de los siguientes: "Organism", "Genetic background", "Medium", "Medium supplement", "Aeration", "Temperature", "pH", "Pressure", "Optical Density (OD)", "Growth phase", "Growth rate", "Vessel type", "Agitation speed" | ||
44 | + - **source_data**: subdocumento que contiene informacion de GEO de donde se extrajo la informacion de la GC | ||
45 | + - **source**: fuente de los datos [ GEO, ] | ||
46 | + - **id**: identificador del registro de la base de datos o fuente de datos | ||
47 | + - **field**: campo de donde se esta tomando la informacion de la GC [ metadata field] | ||
48 | + - **associatedPhrase**: Frase de donde se tomo la informacion | ||
49 | +""" | ||
50 | +def get_term_info(info_df, source, map=True): | ||
51 | + info_df = info_df.replace(nan, "", regex=True) | ||
52 | + term_dict = { | ||
53 | + "name": info_df.TERM_NAME, #NPL output | ||
54 | + "term_id" : info_df.TERM_ID, #MCO | ||
55 | + "term_type": info_df.TERM_TYPE, #NPL | ||
56 | + "source_data": { | ||
57 | + "source": source, | ||
58 | + "id": info_df.GSM, #NPL | ||
59 | + "field": info_df.BANGLINE, #NPL | ||
60 | + "associatedPhrase": info_df.FULL_TEXT, #NPL | ||
61 | + "description" : get_description(info_df, map), | ||
62 | + "similarity_percentage" : info_df.SET | ||
63 | + } | ||
64 | + } | ||
65 | + return(term_dict) | ||
66 | + | ||
67 | + | ||
68 | +""" | ||
69 | + - **objectId**: Identificador en la base de datos fuente | ||
70 | + - **externalCrossReferences_name**: nombre de la DB [ GEO ] | ||
71 | +""" | ||
72 | +def get_crossref_info(info_df, source): | ||
73 | + crossref_dict ={ | ||
74 | + "objectId": info_df.GSM, #NPL | ||
75 | + "externalCrossReferences_name" : source | ||
76 | + } | ||
77 | + return(crossref_dict) | ||
78 | + | ||
79 | +""" | ||
80 | + - **evidence_id**: Identificador de RegulondB asociado a la evidencia | ||
81 | + - **evidence_name**: nombre de la evidencia, si es que no cuenta con el identificador | ||
82 | + - **pmid**: PubMed ID | ||
83 | +""" | ||
84 | +def get_cite_info(info_df, esource): | ||
85 | + cite_dict ={ | ||
86 | + "evidence_id": "", | ||
87 | + "evidence_name" : esource, | ||
88 | + "pmid": info_df.PMID | ||
89 | + } | ||
90 | + return(cite_dict) | ||
91 | + | ||
92 | +def get_description(info_df, map=True): | ||
93 | + if not map: | ||
94 | + subtext = "absent in RegulonDB MCO" | ||
95 | + elif info_df.CASE_MATCH == "MCO" and info_df.SET == 100 and info_df.SORT == 100: | ||
96 | + subtext = "RegulonDB MCO term" | ||
97 | + else: | ||
98 | + subtext = "Similar term in RegulonDB MCO" | ||
99 | + return(subtext) | ||
100 | +""" | ||
101 | +#run it in the main for each field | ||
102 | + | ||
103 | + | ||
104 | +return: type | ||
105 | + | ||
106 | +id: string | ||
107 | +name: string | ||
108 | +description: string | ||
109 | +terms: list of dict | ||
110 | +externalCrossReferences: list of dict | ||
111 | +citations: list of dict | ||
112 | + | ||
113 | +""" | ||
114 | + | ||
115 | +def created_record(info_df, term_list, source = "GEO", esource = "NPL-CRF"): | ||
116 | + #record_dict = OrderedDict() | ||
117 | + record_dict = {} | ||
118 | + info_df = info_df.replace(nan, "", regex=True) | ||
119 | + record_dict["id"] = "" | ||
120 | + record_dict["name"] = "" | ||
121 | + record_dict["description"] = "" | ||
122 | + record_dict["terms"] = term_list | ||
123 | + record_dict["externalCrossReferences"] = [get_crossref_info(info_df, source)] | ||
124 | + record_dict["citations"] = [get_cite_info(info_df, esource)] | ||
125 | + | ||
126 | + return(record_dict) | ||
127 | + | ||
128 | +def json2DataFrame(data): | ||
129 | + mco_syn_dic = dict() | ||
130 | + | ||
131 | + for j,i in enumerate(data): | ||
132 | + if "regulondb_id" in i.keys(): | ||
133 | + | ||
134 | + if "synonyms" in i.keys(): | ||
135 | + | ||
136 | + for k,syn in enumerate(i['synonyms']): | ||
137 | + dict_key = i['regulondb_id']+"_"+str(k) | ||
138 | + mco_syn_dic[dict_key] = { | ||
139 | + #ENTITY_NAME | ||
140 | + 'ENTITY_NAME' : i['name'], | ||
141 | + #ENTITY_SYN | ||
142 | + 'TERM_NAME': syn.lower(), | ||
143 | + #regulondb_id | ||
144 | + 'TERM_ID' : i['regulondb_id'] } | ||
145 | + | ||
146 | + elif "hasRelatedSynonyms" in i.keys(): | ||
147 | + | ||
148 | + for k,syn in enumerate(i['hasRelatedSynonyms']): | ||
149 | + dict_key = i['regulondb_id']+"_"+str(k) | ||
150 | + mco_syn_dic[dict_key] = { | ||
151 | + #ENTITY_NAME | ||
152 | + 'ENTITY_NAME' : i['name'], | ||
153 | + #ENTITY_SYN | ||
154 | + 'TERM_NAME': syn.lower(), | ||
155 | + #regulondb_id | ||
156 | + 'TERM_ID' : i['regulondb_id'] } | ||
157 | + else: | ||
158 | + dict_key = i['regulondb_id'] | ||
159 | + mco_syn_dic[dict_key] = { | ||
160 | + #ENTITY_NAME | ||
161 | + 'ENTITY_NAME' : i['name'], | ||
162 | + #ENTITY_SYN | ||
163 | + 'TERM_NAME': '', | ||
164 | + #regulondb_id | ||
165 | + 'TERM_ID' : i['regulondb_id'] } | ||
166 | + | ||
167 | + mco_syn_df = DF.from_dict(mco_syn_dic).T | ||
168 | + | ||
169 | + | ||
170 | + return(mco_syn_df) | ||
171 | + |
mapping_MCO/bin/format_zika_v3.py
0 → 100755
1 | +# -*- coding: utf-8 -*- | ||
2 | +""" | ||
3 | +#Setup | ||
4 | +""" | ||
5 | + | ||
6 | +#################### Setup #################### | ||
7 | +from collections import defaultdict | ||
8 | +from optparse import OptionParser | ||
9 | +import os | ||
10 | +from numpy.core.fromnumeric import sort | ||
11 | +from pandas import read_csv, DataFrame, merge, concat, read_table | ||
12 | +from numpy import exp, nan | ||
13 | +import seaborn as sns | ||
14 | +from numpy import mean | ||
15 | + | ||
16 | +import matplotlib.pyplot as plt | ||
17 | +import matplotlib | ||
18 | +matplotlib.style.use('ggplot') | ||
19 | +# %matplotlib inline | ||
20 | + | ||
21 | +from collections import Counter | ||
22 | +import json | ||
23 | + | ||
24 | +from fuzzywuzzy import fuzz | ||
25 | +from fuzzywuzzy import process | ||
26 | + | ||
27 | +import format_fun | ||
28 | +import mapping_fun | ||
29 | +import sys | ||
30 | + | ||
31 | +""" | ||
32 | +# input parameters | ||
33 | +--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
34 | +--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv | ||
35 | +--iOntoFile gc_ontology_terms_v2.txt | ||
36 | +--iLinksFile gc_ontology_terms_link_v2.txt | ||
37 | +--iSynFile mco_terms_v0.2.json | ||
38 | +--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ | ||
39 | +--outputFile all_srr_IV_mapped.tsv | ||
40 | +--minPerMatch 90 | ||
41 | + | ||
42 | + | ||
43 | +#Example | ||
44 | +# nohup python3 mapping2MCO_v3.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv --iOntoFile gc_ontology_terms_v2.txt --iSynFile mco_terms_v0.2.json --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ --outputFile srr_htregulondb_mapped.tsv --minPerMatch 80 --minCRFProbs 0.9 > ../reports/srr_htregulondb_mapping_report.out & | ||
45 | +""" | ||
46 | +#################### Defining parameters #################### | ||
47 | +if __name__ == "__main__": | ||
48 | + parser = OptionParser() | ||
49 | + parser.add_option( | ||
50 | + "--inputPath", | ||
51 | + dest="input_path", | ||
52 | + help="Path of npl tagged file (crf output)", | ||
53 | + metavar="PATH") | ||
54 | + parser.add_option( | ||
55 | + "--iAnnotatedFile", | ||
56 | + dest="npl_fname", | ||
57 | + help="Input file of npl tagged file (crf output)", | ||
58 | + metavar="FILE", | ||
59 | + default="") | ||
60 | + parser.add_option( | ||
61 | + "--iOntoFile", | ||
62 | + dest="onto_fname", | ||
63 | + help="Input file with the ontology entities", | ||
64 | + metavar="FILE", | ||
65 | + default="") | ||
66 | + parser.add_option( | ||
67 | + "--iLinksFile", | ||
68 | + dest="links_fname", | ||
69 | + help="Input file with links and id for the ontology", | ||
70 | + metavar="FILE", | ||
71 | + default=None) | ||
72 | + parser.add_option( | ||
73 | + "--iSynFile", | ||
74 | + dest="syn_fname", | ||
75 | + help="Input file for the additional ontology of synonyms", | ||
76 | + metavar="FILE", | ||
77 | + default=None) | ||
78 | + parser.add_option( | ||
79 | + "--outputPath", | ||
80 | + dest="output_path", | ||
81 | + help="Output path to place output files", | ||
82 | + metavar="PATH") | ||
83 | + parser.add_option( | ||
84 | + "--outputFile", | ||
85 | + dest="out_fname", | ||
86 | + help="Output file name for the mapping process", | ||
87 | + metavar="FILE", | ||
88 | + default="") | ||
89 | + parser.add_option( | ||
90 | + "--minPerMatch", | ||
91 | + dest="min_score", | ||
92 | + help="Minimal string matching percentage") | ||
93 | + parser.add_option( | ||
94 | + "--minCRFProbs", | ||
95 | + dest="min_probs", | ||
96 | + help="Minimal crf probabilities") | ||
97 | + | ||
98 | + (options, args) = parser.parse_args() | ||
99 | + if len(args) > 0: | ||
100 | + parser.error("Any parameter given.") | ||
101 | + sys.exit(1) | ||
102 | + | ||
103 | + #################### DISP PARAMETERS #################### | ||
104 | + print('\n\n-------------------------------- PARAMETERS --------------------------------\n') | ||
105 | + print("--inputPath Path of npl tagged file: " + str(options.input_path)) | ||
106 | + print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname)) | ||
107 | + print("--iOntoFile Input file with the ontology entities (MCO-terms): " + str(options.onto_fname)) | ||
108 | + print("--iLinksFile Input file with links and id for the ontology (MCO-type-links): " + str(options.links_fname)) | ||
109 | + print("--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): " + str(options.syn_fname)) | ||
110 | + print("--outputPath Output path to place output files: " + str(options.output_path)) | ||
111 | + print("--outputFile Output of the mapping process: " + str(options.out_fname)) | ||
112 | + print("--minPerMatch Minimal string matching percentage: " + str(options.min_score)) | ||
113 | + print("--minCRFProbs Minimal crf probabilities allowed: " + str(options.min_probs)) | ||
114 | + | ||
115 | + print("\n\n") | ||
116 | + repognrl = "http://pakal.ccg.unam.mx/cmendezc" | ||
117 | + reponame = "automatic-extraction-growth-conditions/tree/master/extraction-geo/download/srr_htregulondb" | ||
118 | + repo_url = '/'.join([repognrl,reponame]) | ||
119 | + | ||
120 | + # Input files | ||
121 | + min_score = int(options.min_score) | ||
122 | + min_probs = float(options.min_probs) | ||
123 | + npl_ifile = os.path.join(options.input_path, options.npl_fname) | ||
124 | + mco_ifile = os.path.join(options.input_path, options.onto_fname) | ||
125 | + mco_syn_ifile = os.path.join(options.input_path, options.syn_fname) | ||
126 | + | ||
127 | + #Output files | ||
128 | + raw_ofname = "_".join(["raw", options.out_fname]) | ||
129 | + rawmap_ofile = os.path.join(options.output_path, raw_ofname) | ||
130 | + str_ofname = "_".join(["sim", options.out_fname]) | ||
131 | + strmap_ofile = os.path.join(options.output_path, str_ofname) | ||
132 | + | ||
133 | + full_ofile = os.path.join(options.output_path, "full_"+options.out_fname) | ||
134 | + full_unmap_ofile = os.path.join(options.output_path, "full_unmap_"+options.out_fname) | ||
135 | + | ||
136 | + json_ofile = os.path.join(options.output_path, options.out_fname) | ||
137 | + json_ofile_map = json_ofile.replace(".tsv", "_map.json") | ||
138 | + json_ofile_unmap= json_ofile.replace(".tsv", "_unmap.json") | ||
139 | + json_ofile_list= json_ofile.replace(".tsv", "_list.json") | ||
140 | + json_ofile_df_list= json_ofile.replace(".tsv", "_df_list.json") | ||
141 | + | ||
142 | + #################### Load input data #################### | ||
143 | + # Load CRF-annotation | ||
144 | + exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"} | ||
145 | + npl_full = read_table(npl_ifile, sep = "\t") | ||
146 | + | ||
147 | + obs_cols = set(npl_full.columns) | ||
148 | + | ||
149 | + if exp_cols.intersection(obs_cols) != exp_cols: | ||
150 | + ocol = ", ".join(list(exp_cols)) | ||
151 | + sys.exit(ocol + " expected columns for iAnnotatedFile" ) | ||
152 | + | ||
153 | + #Load MCO term names | ||
154 | + exp_cols = {"TERM_ID", "TERM_NAME"} | ||
155 | + mco_df_full = read_table(mco_ifile, sep = "\t") | ||
156 | + obs_cols = set(mco_df_full.columns) | ||
157 | + | ||
158 | + if exp_cols.intersection(obs_cols) != exp_cols: | ||
159 | + sys.exit("\"TERM_ID\" and \"TERM_NAME\" expected columns for iOntoFile" ) | ||
160 | + | ||
161 | + mco_df = mco_df_full[["TERM_ID","TERM_NAME"]] | ||
162 | + mco_df = mco_df.drop_duplicates(keep="first") | ||
163 | + mco_df = mco_df.dropna() | ||
164 | + | ||
165 | + #Load MCO links | ||
166 | + if options.links_fname is not None: | ||
167 | + print("\nLoad types...") | ||
168 | + mcolink_ifile = os.path.join(options.input_path, options.links_fname) | ||
169 | + exp_cols = {"TERM_ID", "TERM_TYPE"} | ||
170 | + mco_links_full = read_table(mcolink_ifile, sep = "\t") | ||
171 | + | ||
172 | + obs_cols = set(mco_links_full.columns) | ||
173 | + | ||
174 | + if exp_cols.intersection(obs_cols) != exp_cols: | ||
175 | + sys.exit("at least \"TERM_ID\" and \"TERM_TYPE\" expected columns for iLinksFile" ) | ||
176 | + | ||
177 | + mco_links = mco_links_full[["TERM_ID", "TERM_TYPE"]] | ||
178 | + mco_links = mco_links.drop_duplicates(keep="first") | ||
179 | + mco_links = mco_links.dropna() | ||
180 | + else: | ||
181 | + mco_links = None | ||
182 | + | ||
183 | + #Load MCO terms synonyms | ||
184 | + #format json from mco to dataframe | ||
185 | + mco_json = open(mco_syn_ifile ) | ||
186 | + data = json.load(mco_json) | ||
187 | + mco_syn = format_fun.json2DataFrame(data) | ||
188 | + | ||
189 | + df_json = defaultdict(list) | ||
190 | + | ||
191 | + for idx,row in npl_full.iterrows(): | ||
192 | + record = format_fun.created_record(row) | ||
193 | + df_json[row.SRR].append(record) | ||
194 | + | ||
195 | + df_json | ||
196 | + with open(json_ofile_list, "w") as output: | ||
197 | + json.dump(format_fun.created_record(df_json), output) | ||
198 | + | ||
199 | + with open(json_ofile_df_list, "a") as output: | ||
200 | + for idx,row in df_json.items(): | ||
201 | + json.dump(format_fun.created_record(row), output) | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
mapping_MCO/bin/format_zika_v4.py
0 → 100755
1 | +# -*- coding: utf-8 -*- | ||
2 | +""" | ||
3 | +#Setup | ||
4 | +""" | ||
5 | + | ||
6 | +#################### Setup #################### | ||
7 | +from collections import defaultdict | ||
8 | +from optparse import OptionParser | ||
9 | +import os | ||
10 | +from pandas import read_csv, DataFrame, merge, concat, read_table | ||
11 | +from numpy import exp, nan, mean | ||
12 | +import json | ||
13 | +import format_fun_v4 as format_fun | ||
14 | +import sys | ||
15 | + | ||
16 | +""" | ||
17 | +# input parameters | ||
18 | +--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
19 | +--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv | ||
20 | +--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ | ||
21 | +--outputFile all_srr_IV_mapped.tsv | ||
22 | + | ||
23 | + | ||
24 | +#Example | ||
25 | +# python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/format_zika_v4.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile No_GSM_Metadata_Selected_v3.tsv --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ --outputFile zika_mapped.json | ||
26 | +""" | ||
27 | +#################### Defining parameters #################### | ||
28 | +if __name__ == "__main__": | ||
29 | + parser = OptionParser() | ||
30 | + parser.add_option( | ||
31 | + "--inputPath", | ||
32 | + dest="input_path", | ||
33 | + help="Path of npl tagged file (crf output)", | ||
34 | + metavar="PATH") | ||
35 | + parser.add_option( | ||
36 | + "--iAnnotatedFile", | ||
37 | + dest="npl_fname", | ||
38 | + help="Input file of npl tagged file (crf output)", | ||
39 | + metavar="FILE", | ||
40 | + default="") | ||
41 | + parser.add_option( | ||
42 | + "--outputPath", | ||
43 | + dest="output_path", | ||
44 | + help="Output path to place output files", | ||
45 | + metavar="PATH") | ||
46 | + parser.add_option( | ||
47 | + "--outputFile", | ||
48 | + dest="out_fname", | ||
49 | + help="Output file name for the mapping process", | ||
50 | + metavar="FILE", | ||
51 | + default="") | ||
52 | + | ||
53 | + (options, args) = parser.parse_args() | ||
54 | + if len(args) > 0: | ||
55 | + parser.error("Any parameter given.") | ||
56 | + sys.exit(1) | ||
57 | + | ||
58 | + #################### DISP PARAMETERS #################### | ||
59 | + print('\n\n-------------------------------- PARAMETERS --------------------------------\n') | ||
60 | + print("--inputPath Path of npl tagged file: " + str(options.input_path)) | ||
61 | + print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname)) | ||
62 | + print("--outputPath Output path to place output files: " + str(options.output_path)) | ||
63 | + print("--outputFile Output of the mapping process: " + str(options.out_fname)) | ||
64 | + | ||
65 | + print("\n\n") | ||
66 | + | ||
67 | + # Input files | ||
68 | + npl_ifile = os.path.join(options.input_path, options.npl_fname) | ||
69 | + | ||
70 | + #Output files | ||
71 | + ofname = os.path.join(options.output_path, options.out_fname) | ||
72 | + | ||
73 | + #################### Load input data #################### | ||
74 | + # Load CRF-annotation | ||
75 | + exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"} | ||
76 | + npl_full = read_table(npl_ifile, sep = "\t") | ||
77 | + | ||
78 | + obs_cols = set(npl_full.columns) | ||
79 | + | ||
80 | + if exp_cols.intersection(obs_cols) != exp_cols: | ||
81 | + ocol = ", ".join(list(exp_cols)) | ||
82 | + sys.exit(ocol + " expected columns for iAnnotatedFile" ) | ||
83 | + | ||
84 | + df_json = defaultdict(list) | ||
85 | + | ||
86 | + for idx,row in npl_full.iterrows(): | ||
87 | + record = format_fun.created_record(row, source = "ZIKAdb", no_map = True, esource = "database") | ||
88 | + if(idx<2): print(record) | ||
89 | + #record_json = json.dumps(record) | ||
90 | + record_json = record | ||
91 | + df_json[row.SRR].append(record_json) | ||
92 | + | ||
93 | + """ | ||
94 | + with open(ofname, "a") as output: | ||
95 | + output.write("field:[") | ||
96 | + sep="" | ||
97 | + for k,v in df_json.items(): | ||
98 | + output.write(sep) | ||
99 | + json.dump(v, output) | ||
100 | + sep="," | ||
101 | + output.write("]") | ||
102 | + | ||
103 | + """ | ||
104 | + with open(ofname, "a") as output: | ||
105 | + output.write("{") | ||
106 | + sep="" | ||
107 | + for k,v in df_json.items(): | ||
108 | + output.write(sep) | ||
109 | + output.write("\""+k+"\"") | ||
110 | + output.write(":") | ||
111 | + record_list = { | ||
112 | + "growth_conditions": df_json[k] | ||
113 | + } | ||
114 | + json.dump(record_list, output) | ||
115 | + sep="," | ||
116 | + output.write("}") | ||
117 | + | ||
118 | + df=open(ofname) | ||
119 | + df=json.load(df) | ||
120 | + |
mapping_MCO/bin/format_zika_v5.py
0 → 100755
1 | +# -*- coding: utf-8 -*- | ||
2 | +""" | ||
3 | +#Setup | ||
4 | +""" | ||
5 | + | ||
6 | +#################### Setup #################### | ||
7 | +from optparse import OptionParser | ||
8 | +import os | ||
9 | +from pandas import read_csv, DataFrame, merge, concat, read_table | ||
10 | +from numpy import mean | ||
11 | +import format_fun_v6 as format_fun | ||
12 | +import sys | ||
13 | + | ||
14 | +""" | ||
15 | +# input parameters | ||
16 | +--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
17 | +--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv | ||
18 | +--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ | ||
19 | +--outputFile all_srr_IV_mapped.tsv | ||
20 | + | ||
21 | + | ||
22 | +#Example | ||
23 | +# python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/format_zika_v5.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile No_GSM_Metadata_Selected_v4.tsv --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/ --outputFile zika.json > automatic-extraction-growth-conditions/mapping_MCO/reports/zika_formated_report.out | ||
24 | +# python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/format_zika_v5.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile No_GSM_Metadata_Selected_v4.tsv --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/test/ --outputFile zika_mapped.json > automatic-extraction-growth-conditions/mapping_MCO/test/zika_mapping_report.out | ||
25 | + | ||
26 | +""" | ||
27 | +#################### Defining parameters #################### | ||
28 | +if __name__ == "__main__": | ||
29 | + parser = OptionParser() | ||
30 | + parser.add_option( | ||
31 | + "--inputPath", | ||
32 | + dest="input_path", | ||
33 | + help="Path of npl tagged file (crf output)", | ||
34 | + metavar="PATH") | ||
35 | + parser.add_option( | ||
36 | + "--iAnnotatedFile", | ||
37 | + dest="npl_fname", | ||
38 | + help="Input file of npl tagged file (crf output)", | ||
39 | + metavar="FILE", | ||
40 | + default="") | ||
41 | + parser.add_option( | ||
42 | + "--outputPath", | ||
43 | + dest="output_path", | ||
44 | + help="Output path to place output files", | ||
45 | + metavar="PATH") | ||
46 | + parser.add_option( | ||
47 | + "--outputFile", | ||
48 | + dest="out_fname", | ||
49 | + help="Output file name for the mapping process", | ||
50 | + metavar="FILE", | ||
51 | + default="") | ||
52 | + | ||
53 | + (options, args) = parser.parse_args() | ||
54 | + if len(args) > 0: | ||
55 | + parser.error("Any parameter given.") | ||
56 | + sys.exit(1) | ||
57 | + | ||
58 | + #################### DISP PARAMETERS #################### | ||
59 | + print('\n\n-------------------------------- PARAMETERS --------------------------------\n') | ||
60 | + print("--inputPath Path of npl tagged file: " + str(options.input_path)) | ||
61 | + print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname)) | ||
62 | + print("--outputPath Output path to place output files: " + str(options.output_path)) | ||
63 | + print("--outputFile Output of the mapping process: " + str(options.out_fname)) | ||
64 | + | ||
65 | + print("\n\n") | ||
66 | + | ||
67 | + # Input files | ||
68 | + npl_ifile = os.path.join(options.input_path, options.npl_fname) | ||
69 | + | ||
70 | + #Output files | ||
71 | + ofname = os.path.join(options.output_path, options.out_fname) | ||
72 | + | ||
73 | + #################### Load input data #################### | ||
74 | + # Load CRF-annotation | ||
75 | + exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"} | ||
76 | + npl_full = read_table(npl_ifile, sep = "\t") | ||
77 | + npl_full = npl_full.drop_duplicates() | ||
78 | + | ||
79 | + print(f"Total zika terms: {len(npl_full)} ") | ||
80 | + obs_cols = set(npl_full.columns) | ||
81 | + | ||
82 | + if exp_cols.intersection(obs_cols) != exp_cols: | ||
83 | + ocol = ", ".join(list(exp_cols)) | ||
84 | + sys.exit(ocol + " expected columns for iAnnotatedFile" ) | ||
85 | + """ | ||
86 | + df_terms = defaultdict(list) | ||
87 | + | ||
88 | + for idx,row in npl_full.iterrows(): | ||
89 | + term_record = format_fun.get_term_info(row, source = "ZIKAdb", map=False) | ||
90 | + df_terms[row.SRR].append(term_record) | ||
91 | + | ||
92 | + df_json = {} | ||
93 | + df_tmp = npl_full.drop_duplicates("SRR", keep="first") | ||
94 | + for idx,row in df_tmp.iterrows(): | ||
95 | + srr_record = format_fun.created_record( | ||
96 | + info_df = row, | ||
97 | + term_list = df_terms[row.SRR], | ||
98 | + source = "ZIKAdb", | ||
99 | + esource = "database") | ||
100 | + df_json[row.SRR] = srr_record | ||
101 | + | ||
102 | + with open(ofname, "w") as output: | ||
103 | + json.dump(df_json, output, separators=(',', ':'), indent=4) | ||
104 | + | ||
105 | + df=open(ofname) | ||
106 | + df=json.load(df) | ||
107 | + print(df["ERR1399578"]) | ||
108 | + """ | ||
109 | + npl_full["MAP"] = False | ||
110 | + format_fun.to_json( | ||
111 | + df = npl_full, | ||
112 | + source_info = "ZIKAdb", | ||
113 | + evidence_source = "database", | ||
114 | + ofname = ofname) | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
... | @@ -23,7 +23,8 @@ import json | ... | @@ -23,7 +23,8 @@ import json |
23 | from fuzzywuzzy import fuzz | 23 | from fuzzywuzzy import fuzz |
24 | from fuzzywuzzy import process | 24 | from fuzzywuzzy import process |
25 | 25 | ||
26 | -import format_fun | 26 | +#import format_fun |
27 | +import format_fun_v4 as format_fun | ||
27 | import mapping_fun | 28 | import mapping_fun |
28 | import sys | 29 | import sys |
29 | 30 | ||
... | @@ -338,5 +339,6 @@ if __name__ == "__main__": | ... | @@ -338,5 +339,6 @@ if __name__ == "__main__": |
338 | with open(json_ofile_unmap, "a") as output: | 339 | with open(json_ofile_unmap, "a") as output: |
339 | for idx,row in full_unmap.iterrows(): | 340 | for idx,row in full_unmap.iterrows(): |
340 | json.dump(format_fun.created_record(row), output) | 341 | json.dump(format_fun.created_record(row), output) |
342 | + | ||
343 | + | ||
341 | 344 | ||
342 | - | ||
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
mapping_MCO/bin/mapping2MCO_v4.py
0 → 100755
1 | +# -*- coding: utf-8 -*- | ||
2 | +""" | ||
3 | +#Setup | ||
4 | +""" | ||
5 | + | ||
6 | +#################### Setup #################### | ||
7 | +from optparse import OptionParser | ||
8 | +import os | ||
9 | +from numpy.core.fromnumeric import sort | ||
10 | +from pandas import read_csv, DataFrame, merge, concat, read_table | ||
11 | +from numpy import exp, nan | ||
12 | +import seaborn as sns | ||
13 | +from numpy import mean | ||
14 | + | ||
15 | +import matplotlib.pyplot as plt | ||
16 | +import matplotlib | ||
17 | +matplotlib.style.use('ggplot') | ||
18 | +# %matplotlib inline | ||
19 | + | ||
20 | +from collections import Counter, defaultdict | ||
21 | +import json | ||
22 | + | ||
23 | +from fuzzywuzzy import fuzz | ||
24 | +from fuzzywuzzy import process | ||
25 | + | ||
26 | +#import format_fun | ||
27 | +import format_fun_v4 as format_fun | ||
28 | +import mapping_fun | ||
29 | +import sys | ||
30 | + | ||
31 | +""" | ||
32 | +# input parameters | ||
33 | +--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
34 | +--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv | ||
35 | +--iOntoFile gc_ontology_terms_v2.txt | ||
36 | +--iLinksFile gc_ontology_terms_link_v2.txt | ||
37 | +--iSynFile mco_terms_v0.2.json | ||
38 | +--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ | ||
39 | +--outputFile all_srr_IV_mapped.tsv | ||
40 | +--minPerMatch 90 | ||
41 | + | ||
42 | + | ||
43 | +#Example | ||
44 | +# nohup python3 mapping2MCO_v3.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv --iOntoFile gc_ontology_terms_v2.txt --iSynFile mco_terms_v0.2.json --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ --outputFile srr_htregulondb_mapped.tsv --minPerMatch 80 --minCRFProbs 0.9 > ../reports/srr_htregulondb_mapping_report.out & | ||
45 | +""" | ||
#################### Defining parameters ####################
if __name__ == "__main__":
    # CLI entry point: map NPL-CRF growth-condition terms to the MCO ontology
    # in two passes (exact string match, then fuzzy string similarity), and
    # write TSV + JSON outputs for both mapped and unmapped terms.
    parser = OptionParser()
    parser.add_option(
        "--inputPath",
        dest="input_path",
        help="Path of npl tagged file (crf output)",
        metavar="PATH")
    parser.add_option(
        "--iAnnotatedFile",
        dest="npl_fname",
        help="Input file of npl tagged file (crf output)",
        metavar="FILE",
        default="")
    parser.add_option(
        "--iOntoFile",
        dest="onto_fname",
        help="Input file with the ontology entities",
        metavar="FILE",
        default="")
    parser.add_option(
        "--iLinksFile",
        dest="links_fname",
        help="Input file with links and id for the ontology",
        metavar="FILE",
        default=None)
    parser.add_option(
        "--iSynFile",
        dest="syn_fname",
        help="Input file for the additional ontology of synonyms",
        metavar="FILE",
        default=None)
    parser.add_option(
        "--outputPath",
        dest="output_path",
        help="Output path to place output files",
        metavar="PATH")
    parser.add_option(
        "--outputFile",
        dest="out_fname",
        help="Output file name for the mapping process",
        metavar="FILE",
        default="")
    parser.add_option(
        "--minPerMatch",
        dest="min_score",
        help="Minimal string matching percentage")
    parser.add_option(
        "--minCRFProbs",
        dest="min_probs",
        help="Minimal crf probabilities")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and exits with status 2, so the
        # sys.exit(1) that used to follow it was unreachable and is removed.
        parser.error("Any parameter given.")

    #################### DISP PARAMETERS ####################
    print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
    print("--inputPath Path of npl tagged file: " + str(options.input_path))
    print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
    print("--iOntoFile Input file with the ontology entities (MCO-terms): " + str(options.onto_fname))
    print("--iLinksFile Input file with links and id for the ontology (MCO-type-links): " + str(options.links_fname))
    print("--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): " + str(options.syn_fname))
    print("--outputPath Output path to place output files: " + str(options.output_path))
    print("--outputFile Output of the mapping process: " + str(options.out_fname))
    print("--minPerMatch Minimal string matching percentage: " + str(options.min_score))
    print("--minCRFProbs Minimal crf probabilities allowed: " + str(options.min_probs))

    print("\n\n")
    # GitLab location of the downloaded GEO soft files (used for REPO_FILE).
    repognrl = "http://pakal.ccg.unam.mx/cmendezc"
    reponame = "automatic-extraction-growth-conditions/tree/master/extraction-geo/download/srr_htregulondb"
    repo_url = '/'.join([repognrl, reponame])

    # Input files
    min_score = int(options.min_score)
    min_probs = float(options.min_probs)
    npl_ifile = os.path.join(options.input_path, options.npl_fname)
    mco_ifile = os.path.join(options.input_path, options.onto_fname)
    mco_syn_ifile = os.path.join(options.input_path, options.syn_fname)

    # Output files
    raw_ofname = "_".join(["raw", options.out_fname])
    rawmap_ofile = os.path.join(options.output_path, raw_ofname)
    str_ofname = "_".join(["sim", options.out_fname])
    strmap_ofile = os.path.join(options.output_path, str_ofname)

    full_ofile = os.path.join(options.output_path, "full_" + options.out_fname)
    full_unmap_ofile = os.path.join(options.output_path, "full_unmap_" + options.out_fname)

    json_ofile = os.path.join(options.output_path, options.out_fname)
    json_ofile_map = json_ofile.replace(".tsv", "_map.json")
    json_ofile_unmap = json_ofile.replace(".tsv", "_unmap.json")

    #################### Load input data ####################
    # Load CRF-annotation; these columns must be present in the input table.
    exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
    npl_full = read_table(npl_ifile, sep="\t")

    obs_cols = set(npl_full.columns)

    if exp_cols.intersection(obs_cols) != exp_cols:
        ocol = ", ".join(list(exp_cols))
        sys.exit(ocol + " expected columns for iAnnotatedFile")

    # Keep only confident CRF predictions, then deduplicate and drop NA rows.
    npl_df = npl_full[npl_full.PROB >= min_probs]
    npl_df = npl_df.drop_duplicates(keep="first")
    npl_df = npl_df.dropna()

    # Cleaning input: translate NPL term types into MCO vocabulary.
    npl_df['TERM_TYPE'] = [mapping_fun.transterm_npl2mco(term) for term in npl_df.TERM_TYPE]
    # filter non-mco terms types
    npl_df = npl_df[npl_df.TERM_TYPE != "exTag Type"]

    # add repofile_ source. access to stored files at gitLab
    source_access = ['/'.join([repo_url, gse, gse + '.soft.gz']) for gse in npl_df['GSE']]
    npl_df['REPO_FILE'] = source_access

    ## remove additional spaces and the PMID_ prefix
    npl_df['TERM_NAME'] = [txt.strip() for txt in npl_df['TERM_NAME']]
    npl_df['PMID'] = [pmid.replace("PMID_", "") for pmid in npl_df['PMID']]

    # Load MCO term names
    exp_cols = {"TERM_ID", "TERM_NAME"}
    mco_df_full = read_table(mco_ifile, sep="\t")
    obs_cols = set(mco_df_full.columns)

    if exp_cols.intersection(obs_cols) != exp_cols:
        sys.exit("\"TERM_ID\" and \"TERM_NAME\" expected columns for iOntoFile")

    mco_df = mco_df_full[["TERM_ID", "TERM_NAME"]]
    mco_df = mco_df.drop_duplicates(keep="first")
    mco_df = mco_df.dropna()

    # Load MCO links (optional TERM_ID -> TERM_TYPE table)
    if options.links_fname is not None:
        print("\nLoad types...")
        mcolink_ifile = os.path.join(options.input_path, options.links_fname)
        exp_cols = {"TERM_ID", "TERM_TYPE"}
        mco_links_full = read_table(mcolink_ifile, sep="\t")

        obs_cols = set(mco_links_full.columns)

        if exp_cols.intersection(obs_cols) != exp_cols:
            sys.exit("at least \"TERM_ID\" and \"TERM_TYPE\" expected columns for iLinksFile")

        mco_links = mco_links_full[["TERM_ID", "TERM_TYPE"]]
        mco_links = mco_links.drop_duplicates(keep="first")
        mco_links = mco_links.dropna()
    else:
        mco_links = None

    # Load MCO terms synonyms: format json from mco to dataframe.
    # `with` ensures the handle is closed (it previously leaked).
    with open(mco_syn_ifile) as mco_json:
        data = json.load(mco_json)
    mco_syn = format_fun.json2DataFrame(data)

    print('\n\n-------------------------------- INPUTS --------------------------------\n')

    print("\nnpl tagged file\n")
    print(npl_df.head(3))
    print("\nontology entities\n")
    print(mco_df.head(3))
    if options.links_fname is not None:
        print("\nlinks and id for the ontology (MCO-type-links)\n")
        print(mco_links.head(3))
    print("\nadditional ontology of synonyms (MCO-syn-json)\n")
    print(mco_syn.head(3))

    print('\n\n-------------------------------- RESULTS --------------------------------\n')

    #################### mappping to MCO exact string ####################
    print(f"\nMapping {len(npl_df.index)} terms to MCO based on exact strings...\n")

    # first mapping: exact string match against MCO term names
    raw_matches = mapping_fun.raw_map_mco(
        npl_df=npl_df,
        mco_df=mco_df,
        mco_links=mco_links,
        unmap=True)

    # save file name source of the raw mapping
    raw_matches["SOURCE"] = mco_ifile
    # additional column to merge
    raw_matches["ENTITY_NAME"] = ""

    #################### mappping to MCO.syn exact string ####################

    # define unmapped (rows whose TERM_ID is still NA)
    raw_mco_unmap = raw_matches[raw_matches.isna().TERM_ID]
    # input for te second step
    raw_mco_unmap = raw_mco_unmap[list(npl_df.columns)]

    print(f"\nMapping {len(raw_mco_unmap.index)} terms to MCO - synonyms based on exact strings...\n")

    # second mapping: exact string match against MCO synonyms
    raw_matches_syn = mapping_fun.raw_map_mco(
        npl_df=raw_mco_unmap,
        mco_df=mco_syn,
        unmap=True)

    # additional column to merge
    raw_matches_syn["SOURCE"] = mco_syn_ifile

    #################### save mapped terms based on exact strings ####################

    # all exact-string matches (MCO names + synonyms)
    raw_map_odf = concat([raw_matches, raw_matches_syn], sort=True).dropna()

    print(raw_map_odf.head(3))
    print(f"Total of terms mapped by exact strings: {len(raw_map_odf.index)}")
    print("Saving filtered terms from raw mapping...\n\n")

    raw_map_odf.to_csv(rawmap_ofile, sep="\t", header=True, index=False)

    #################### unmmaped raw terms ####################
    raw_mco_syn_unmap = raw_matches_syn[raw_matches_syn.isna().TERM_ID]
    raw_mco_syn_unmap = raw_mco_syn_unmap[list(npl_df.columns)]

    print(f"\n{len(raw_mco_syn_unmap.index)} unmapped terms based on exact strings")
    print("Dropping duplicated unmapped term names...\n")
    raw_mco_syn_unmap = raw_mco_syn_unmap.drop_duplicates("TERM_NAME")

    print(f"{len(raw_mco_syn_unmap.index)} unmapped unique terms based on exact strings\n")

    #################### string similarity mapping ####################
    ### Matching unmaped term names by fuzzy string similarity
    print(f"\nMapping to MCO {len(raw_mco_syn_unmap.index)} terms based on string similarity...\n")

    str_matches = mapping_fun.str_match_map_mco(raw_mco_syn_unmap, mco_df, mco_links=mco_links, min_match=0, npl_merges=False)
    str_matches_odf = str_matches[str_matches.SET >= min_score]
    str_matches_odf["SOURCE"] = mco_ifile

    #################### unmmaped sim terms (MCO) ####################
    str_mco_unmap = str_matches[str_matches.SET < min_score]
    str_mco_unmap = str_mco_unmap.drop_duplicates("TERM_NAME")

    print(f"\nMapping to MCO - synonyms {len(str_mco_unmap.index)} terms based on string siilarity..\n")
    str_matches_syn = mapping_fun.str_match_map_mco(str_mco_unmap, mco_syn, min_match=min_score, npl_merges=False)
    str_matches_syn_odf = str_matches_syn[str_matches_syn.SET >= min_score]
    str_matches_syn_odf["SOURCE"] = mco_syn_ifile

    #################### save str-sim map terms ####################
    all_str_matches_odf = concat([str_matches_odf, str_matches_syn_odf], sort=True).dropna()

    print(f"Unique terms mapped by string similarity: {len(all_str_matches_odf.index)}")

    all_str_matches_npl_odf = merge(npl_df, all_str_matches_odf, on=["TERM_NAME"], how="inner")

    print(f"Total of terms mapped by string similarity: {len(all_str_matches_npl_odf.index)}")
    print("Saving filtered terms from str mapping...")

    all_str_matches_npl_odf.to_csv(strmap_ofile, sep="\t", header=True, index=False)

    #################### Formatting json ####################
    # Exact matches get perfect similarity scores for the merged output.
    raw_map_odf["CASE_MATCH"] = "MCO"
    raw_map_odf["SET"] = 100
    raw_map_odf["SORT"] = 100

    full_map = concat([all_str_matches_npl_odf, raw_map_odf], sort=True)

    full_map.to_csv(full_ofile, sep="\t", header=True, index=False)

    print(f"Total of terms mapped: {len(full_map.index)}")

    # Group one JSON record per mapped term, keyed by its SRR run id.
    df_json = defaultdict(list)

    for idx, row in full_map.iterrows():
        record = format_fun.created_record(row)
        df_json[row.SRR].append(record)
        if idx < 2:
            print(record)

    # Write a single JSON object keyed by SRR. Mode "w" (was "a"): appending
    # to a previous run's file would yield `{...}{...}`, which the validation
    # json.load below cannot parse.
    with open(json_ofile_map, "w") as output:
        output.write("{")
        sep = ""
        for k, v in df_json.items():
            # was `v != {}`, which compares a list to a dict and is always
            # True; test list emptiness instead
            if v:
                output.write(sep)
                output.write("\"" + k + "\"")
                output.write(":")
                record_list = {
                    "growth_conditions": v
                }
                json.dump(record_list, output)
                sep = ","
        output.write("}")

    # Round-trip sanity check: the emitted file must be valid JSON.
    with open(json_ofile_map) as fh:
        df = json.load(fh)

    # Terms that neither mapping strategy resolved.
    full_unmap = merge(npl_df, full_map[["TERM_NAME", "TERM_ID"]], on=["TERM_NAME"], how='left')
    full_unmap = full_unmap[full_unmap.isna().TERM_ID]
    print(full_unmap.head(3))

    print(f"Total of terms unmapped: {len(full_unmap.index)}")

    full_unmap["SOURCE"] = ""
    full_unmap["CASE_MATCH"] = ""
    full_unmap["SET"] = 0
    full_unmap["SORT"] = 0

    full_unmap.to_csv(full_unmap_ofile, sep="\t", header=True, index=False)

    df_json = defaultdict(list)

    for idx, row in full_unmap.iterrows():
        record = format_fun.created_record(row, no_map=True)
        df_json[row.SRR].append(record)
        if idx < 2:
            print(record)

    # Same writer for the unmapped records ("w" for the same reason as above).
    with open(json_ofile_unmap, "w") as output:
        output.write("{")
        sep = ""
        for k, v in df_json.items():
            output.write(sep)
            output.write("\"" + k + "\"")
            output.write(":")
            record_list = {
                "growth_conditions": v
            }
            json.dump(record_list, output)
            sep = ","
        output.write("}")

    # Round-trip sanity check for the unmapped JSON as well.
    with open(json_ofile_unmap) as fh:
        df = json.load(fh)
mapping_MCO/bin/mapping2MCO_v5.py
0 → 100755
1 | +# -*- coding: utf-8 -*- | ||
2 | +""" | ||
3 | +#Setup | ||
4 | +""" | ||
5 | + | ||
6 | +#################### Setup #################### | ||
7 | +from optparse import OptionParser | ||
8 | +import os | ||
9 | +from numpy.core.fromnumeric import sort | ||
10 | +from pandas import read_csv, DataFrame, merge, concat, read_table | ||
11 | +from numpy import exp, nan | ||
12 | +import seaborn as sns | ||
13 | +from numpy import mean | ||
14 | + | ||
15 | +import matplotlib.pyplot as plt | ||
16 | +import matplotlib | ||
17 | +matplotlib.style.use('ggplot') | ||
18 | +# %matplotlib inline | ||
19 | + | ||
20 | +from collections import Counter, defaultdict | ||
21 | +import json | ||
22 | + | ||
23 | +from fuzzywuzzy import fuzz | ||
24 | +from fuzzywuzzy import process | ||
25 | + | ||
26 | +#import format_fun | ||
27 | +import format_fun_v6 as format_fun | ||
28 | +import mapping_fun | ||
29 | +import sys | ||
30 | + | ||
31 | +""" | ||
32 | +# input parameters | ||
33 | +--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
34 | +--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv | ||
35 | +--iOntoFile gc_ontology_terms_v2.txt | ||
36 | +--iLinksFile gc_ontology_terms_link_v2.txt | ||
37 | +--iSynFile mco_terms_v0.2.json | ||
38 | +--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ | ||
39 | +--outputFile all_srr_IV_mapped.tsv | ||
40 | +--minPerMatch 90 | ||
41 | + | ||
42 | + | ||
43 | +#Example | ||
44 | +# nohup python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v5.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv --iOntoFile gc_ontology_terms_v2.txt --iSynFile mco_terms_v0.2.json --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/ --outputFile srr_htregulondb_v2.tsv --minPerMatch 80 --minCRFProbs 0.9 > /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/srr_htregulondb_mapping_report_v2.out & | ||
45 | +""" | ||
46 | +#################### Defining parameters #################### | ||
47 | +if __name__ == "__main__": | ||
48 | + parser = OptionParser() | ||
49 | + parser.add_option( | ||
50 | + "--inputPath", | ||
51 | + dest="input_path", | ||
52 | + help="Path of npl tagged file (crf output)", | ||
53 | + metavar="PATH") | ||
54 | + parser.add_option( | ||
55 | + "--iAnnotatedFile", | ||
56 | + dest="npl_fname", | ||
57 | + help="Input file of npl tagged file (crf output)", | ||
58 | + metavar="FILE", | ||
59 | + default="") | ||
60 | + parser.add_option( | ||
61 | + "--iOntoFile", | ||
62 | + dest="onto_fname", | ||
63 | + help="Input file with the ontology entities", | ||
64 | + metavar="FILE", | ||
65 | + default="") | ||
66 | + parser.add_option( | ||
67 | + "--iLinksFile", | ||
68 | + dest="links_fname", | ||
69 | + help="Input file with links and id for the ontology", | ||
70 | + metavar="FILE", | ||
71 | + default=None) | ||
72 | + parser.add_option( | ||
73 | + "--iSynFile", | ||
74 | + dest="syn_fname", | ||
75 | + help="Input file for the additional ontology of synonyms", | ||
76 | + metavar="FILE", | ||
77 | + default=None) | ||
78 | + parser.add_option( | ||
79 | + "--outputPath", | ||
80 | + dest="output_path", | ||
81 | + help="Output path to place output files", | ||
82 | + metavar="PATH") | ||
83 | + parser.add_option( | ||
84 | + "--outputFile", | ||
85 | + dest="out_fname", | ||
86 | + help="Output file name for the mapping process", | ||
87 | + metavar="FILE", | ||
88 | + default="") | ||
89 | + parser.add_option( | ||
90 | + "--minPerMatch", | ||
91 | + dest="min_score", | ||
92 | + help="Minimal string matching percentage") | ||
93 | + parser.add_option( | ||
94 | + "--minCRFProbs", | ||
95 | + dest="min_probs", | ||
96 | + help="Minimal crf probabilities") | ||
97 | + | ||
98 | + (options, args) = parser.parse_args() | ||
99 | + if len(args) > 0: | ||
100 | + parser.error("Any parameter given.") | ||
101 | + sys.exit(1) | ||
102 | + | ||
103 | + #################### DISP PARAMETERS #################### | ||
104 | + print('\n\n-------------------------------- PARAMETERS --------------------------------\n') | ||
105 | + print("--inputPath Path of npl tagged file: " + str(options.input_path)) | ||
106 | + print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname)) | ||
107 | + print("--iOntoFile Input file with the ontology entities (MCO-terms): " + str(options.onto_fname)) | ||
108 | + print("--iLinksFile Input file with links and id for the ontology (MCO-type-links): " + str(options.links_fname)) | ||
109 | + print("--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): " + str(options.syn_fname)) | ||
110 | + print("--outputPath Output path to place output files: " + str(options.output_path)) | ||
111 | + print("--outputFile Output of the mapping process: " + str(options.out_fname)) | ||
112 | + print("--minPerMatch Minimal string matching percentage: " + str(options.min_score)) | ||
113 | + print("--minCRFProbs Minimal crf probabilities allowed: " + str(options.min_probs)) | ||
114 | + | ||
115 | + print("\n\n") | ||
116 | + repognrl = "http://pakal.ccg.unam.mx/cmendezc" | ||
117 | + reponame = "automatic-extraction-growth-conditions/tree/master/extraction-geo/download/srr_htregulondb" | ||
118 | + repo_url = '/'.join([repognrl,reponame]) | ||
119 | + | ||
120 | + # Input files ======================================================================================== | ||
121 | + min_score = int(options.min_score) | ||
122 | + min_probs = float(options.min_probs) | ||
123 | + npl_ifile = os.path.join(options.input_path, options.npl_fname) | ||
124 | + mco_ifile = os.path.join(options.input_path, options.onto_fname) | ||
125 | + mco_syn_ifile = os.path.join(options.input_path, options.syn_fname) | ||
126 | + | ||
127 | + # Output files ======================================================================================= | ||
128 | + | ||
129 | + #Save by mapping stratergy | ||
130 | + raw_ofname = "_".join(["raw", options.out_fname]) | ||
131 | + rawmap_ofile = os.path.join(options.output_path, raw_ofname) | ||
132 | + str_ofname = "_".join(["sim", options.out_fname]) | ||
133 | + strmap_ofile = os.path.join(options.output_path, str_ofname) | ||
134 | + | ||
135 | + #Saving map und unmap | ||
136 | + full_map_ofile = os.path.join(options.output_path, "full_map_"+options.out_fname) | ||
137 | + full_unmap_ofile = os.path.join(options.output_path, "full_unmap_"+options.out_fname) | ||
138 | + | ||
139 | + #Save JSONs | ||
140 | + json_ofile = os.path.join(options.output_path, options.out_fname) | ||
141 | + json_ofile_map = json_ofile.replace(".tsv", "_map.json") | ||
142 | + json_ofile_unmap= json_ofile.replace(".tsv", "_unmap.json") | ||
143 | + json_ofile_full = json_ofile.replace(".tsv", "_full.json") | ||
144 | + | ||
145 | + # Load input data ==================================================================================== | ||
146 | + | ||
147 | + #Columns for the NPL-CRF extraction | ||
148 | + exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"} | ||
149 | + | ||
150 | + #Load CRF-annotation | ||
151 | + npl_full = read_table(npl_ifile, sep = "\t") | ||
152 | + | ||
153 | + #Check input | ||
154 | + obs_cols = set(npl_full.columns) | ||
155 | + if exp_cols.intersection(obs_cols) != exp_cols: | ||
156 | + ocol = ", ".join(list(exp_cols)) | ||
157 | + sys.exit(ocol + " expected columns for iAnnotatedFile" ) | ||
158 | + | ||
159 | + #Filter Input by probs | ||
160 | + npl_df = npl_full[npl_full.PROB >= min_probs] | ||
161 | + npl_df = npl_df.drop_duplicates(keep="first") | ||
162 | + npl_df = npl_df.dropna() | ||
163 | + | ||
164 | + #Cleaning input | ||
165 | + npl_df['TERM_TYPE'] = [mapping_fun.transterm_npl2mco(term) for term in npl_df.TERM_TYPE] | ||
166 | + #filter non-mco terms types | ||
167 | + npl_df = npl_df[npl_df.TERM_TYPE != "exTag Type"] | ||
168 | + | ||
169 | + #add repofile_ source. access to stored files at gitLab | ||
170 | + source_access = ['/'.join([repo_url,gse,gse+'.soft.gz']) for gse in npl_df['GSE']] | ||
171 | + npl_df['REPO_FILE'] = source_access | ||
172 | + | ||
173 | + ##remove additional spaces | ||
174 | + npl_df['TERM_NAME'] = [txt.strip() for txt in npl_df['TERM_NAME']] | ||
175 | + npl_df['PMID'] = [pmid.replace("PMID_", "") for pmid in npl_df['PMID']] | ||
176 | + | ||
177 | + #Columns for MCO | ||
178 | + exp_cols = {"TERM_ID", "TERM_NAME"} | ||
179 | + | ||
180 | + #Load MCO term names | ||
181 | + mco_df_full = read_table(mco_ifile, sep = "\t") | ||
182 | + | ||
183 | + #Check input MCO | ||
184 | + obs_cols = set(mco_df_full.columns) | ||
185 | + if exp_cols.intersection(obs_cols) != exp_cols: | ||
186 | + sys.exit("\"TERM_ID\" and \"TERM_NAME\" expected columns for iOntoFile" ) | ||
187 | + | ||
188 | + #Clean MCO input | ||
189 | + mco_df = mco_df_full[["TERM_ID","TERM_NAME"]] | ||
190 | + mco_df = mco_df.drop_duplicates(keep="first") | ||
191 | + mco_df = mco_df.dropna() | ||
192 | + | ||
193 | + #Load MCO links | ||
194 | + if options.links_fname is not None: | ||
195 | + print("\nLoad types...") | ||
196 | + mcolink_ifile = os.path.join(options.input_path, options.links_fname) | ||
197 | + exp_cols = {"TERM_ID", "TERM_TYPE"} | ||
198 | + mco_links_full = read_table(mcolink_ifile, sep = "\t") | ||
199 | + | ||
200 | + obs_cols = set(mco_links_full.columns) | ||
201 | + | ||
202 | + if exp_cols.intersection(obs_cols) != exp_cols: | ||
203 | + sys.exit("at least \"TERM_ID\" and \"TERM_TYPE\" expected columns for iLinksFile" ) | ||
204 | + | ||
205 | + mco_links = mco_links_full[["TERM_ID", "TERM_TYPE"]] | ||
206 | + mco_links = mco_links.drop_duplicates(keep="first") | ||
207 | + mco_links = mco_links.dropna() | ||
208 | + else: | ||
209 | + mco_links = None | ||
210 | + | ||
211 | + #Load MCO terms synonyms | ||
212 | + mco_json = open(mco_syn_ifile ) | ||
213 | + | ||
214 | + #format json from mco to dataframe | ||
215 | + data = json.load(mco_json) | ||
216 | + mco_syn = format_fun.json2DataFrame(data) | ||
217 | + | ||
218 | + | ||
219 | + print('\n\n-------------------------------- INPUTS --------------------------------\n') | ||
220 | + | ||
221 | + print("\nnpl tagged file\n") | ||
222 | + print(npl_df.head(3)) | ||
223 | + print("\nontology entities\n") | ||
224 | + print(mco_df.head(3)) | ||
225 | + if options.links_fname is not None: | ||
226 | + print("\nlinks and id for the ontology (MCO-type-links)\n") | ||
227 | + print(mco_links.head(3)) | ||
228 | + print("\nadditional ontology of synonyms (MCO-syn-json)\n") | ||
229 | + print(mco_syn.head(3)) | ||
230 | + | ||
231 | + | ||
232 | + print('\n\n-------------------------------- RESULTS --------------------------------\n') | ||
233 | + | ||
234 | + #################### mappping to MCO exact string #################### | ||
235 | + #npl_df = npl_df.drop_duplicates("TERM_NAME", keep="first") | ||
236 | + #npl_df = npl_df.head(10) | ||
237 | + | ||
238 | + print("\nTracking exact terms to MCO...") | ||
239 | + print(f"\nMapping {len(npl_df.index)} terms to MCO based on exact strings...") | ||
240 | + | ||
241 | + #Exact mapping to MCO | ||
242 | + raw_matches = mapping_fun.raw_map_mco( | ||
243 | + npl_df = npl_df, | ||
244 | + mco_df = mco_df, | ||
245 | + mco_links = mco_links, | ||
246 | + unmap = True) | ||
247 | + | ||
248 | + #save file name source of the raw mapping | ||
249 | + raw_matches["SOURCE"] = mco_ifile | ||
250 | + #additional column to merge | ||
251 | + raw_matches["ENTITY_NAME"] = "" | ||
252 | + | ||
253 | + #################### mappping to MCO.syn exact string #################### | ||
254 | + | ||
255 | + #define unmapped | ||
256 | + raw_mco_unmap = raw_matches[raw_matches.isna().TERM_ID] | ||
257 | + #input for te second step | ||
258 | + raw_mco_unmap = raw_mco_unmap[list(npl_df.columns)] | ||
259 | + | ||
260 | + print(f"\nMapping {len(raw_mco_unmap.index)} terms to MCO - synonyms based on exact strings...\n") | ||
261 | + | ||
262 | + #exact mapping to synonims | ||
263 | + raw_matches_syn = mapping_fun.raw_map_mco( | ||
264 | + npl_df = raw_mco_unmap, | ||
265 | + mco_df = mco_syn, | ||
266 | + unmap = True) | ||
267 | + | ||
268 | + #additional column to merge | ||
269 | + raw_matches_syn["SOURCE"] = mco_syn_ifile | ||
270 | + #raw_matches_syn["TERM_TYPE"] = "" | ||
271 | + | ||
272 | + #################### save mapped terms based on exact strings #################### | ||
273 | + | ||
274 | + #all mapped | ||
275 | + raw_map_odf = concat( | ||
276 | + [raw_matches, raw_matches_syn], | ||
277 | + sort=True).dropna() | ||
278 | + | ||
279 | + #print(raw_map_odf.head(3)) | ||
280 | + print(f"Total of terms mapped by exact strings: {len(raw_map_odf.index)}") | ||
281 | + print("Saving filtered terms from raw mapping...\n\n") | ||
282 | + | ||
283 | + raw_map_odf.to_csv( | ||
284 | + rawmap_ofile, | ||
285 | + sep = "\t", | ||
286 | + header =True, | ||
287 | + index=False) | ||
288 | + | ||
289 | + #################### unmmaped raw terms #################### | ||
290 | + raw_mco_syn_unmap = raw_matches_syn[raw_matches_syn.isna().TERM_ID] | ||
291 | + raw_mco_syn_unmap = raw_mco_syn_unmap[list(npl_df.columns)] | ||
292 | + | ||
293 | + print(f"{len(raw_mco_syn_unmap.index)} unmapped terms based on exact strings") | ||
294 | + print("Dropping duplicated unmapped term names...") | ||
295 | + raw_mco_syn_unmap = raw_mco_syn_unmap.drop_duplicates("TERM_NAME") | ||
296 | + | ||
297 | + print(f"{len(raw_mco_syn_unmap.index)} unmapped unique terms based on exact strings") | ||
298 | + | ||
299 | + #################### string similarity mapping #################### | ||
300 | + ###Matching unmaped terms by string similarity | ||
301 | + print("\ncompute string similarty...") | ||
302 | + | ||
303 | + print(f"\nMapping to MCO {len(raw_mco_syn_unmap.index)} terms based on string similarity...") | ||
304 | + | ||
305 | + str_matches = mapping_fun.str_match_map_mco( | ||
306 | + raw_mco_syn_unmap, mco_df, | ||
307 | + mco_links = mco_links, | ||
308 | + min_match=0, | ||
309 | + npl_merges=False) | ||
310 | + | ||
311 | + str_matches_odf = str_matches[str_matches.SET >= min_score] | ||
312 | + str_matches_odf["SOURCE"] = mco_ifile | ||
313 | + | ||
314 | + #################### unmmaped sim terms (MCO) #################### | ||
315 | + str_mco_unmap = str_matches[str_matches.SET < min_score] | ||
316 | + #str_mco_unmap = str_mco_unmap[list(npl_df.columns)] | ||
317 | + str_mco_unmap = str_mco_unmap.drop_duplicates("TERM_NAME") | ||
318 | + | ||
319 | + print(f"\nMapping to MCO - synonyms {len(str_mco_unmap.index)} terms based on string siilarity..\n") | ||
320 | + str_matches_syn = mapping_fun.str_match_map_mco( | ||
321 | + str_mco_unmap, mco_syn, | ||
322 | + min_match=min_score, | ||
323 | + npl_merges=False) | ||
324 | + | ||
325 | + str_matches_syn_odf = str_matches_syn[str_matches_syn.SET >= min_score] | ||
326 | + str_matches_syn_odf["SOURCE"] = mco_syn_ifile | ||
327 | + | ||
328 | + #################### save str-sim map terms #################### | ||
329 | + all_str_matches_odf = concat( | ||
330 | + [str_matches_odf, str_matches_syn_odf], | ||
331 | + sort = True).dropna() | ||
332 | + | ||
333 | + print(f"Unique terms mapped by string similarity: {len(all_str_matches_odf.index)}") | ||
334 | + | ||
335 | + all_str_matches_npl_odf = merge( | ||
336 | + npl_df, all_str_matches_odf, | ||
337 | + on = ["TERM_NAME"], | ||
338 | + how="inner") | ||
339 | + | ||
340 | + print(f"Total of terms mapped by string similarity: {len(all_str_matches_npl_odf.index)}") | ||
341 | + print("Saving filtered terms from str mapping...\n\n") | ||
342 | + | ||
343 | + all_str_matches_npl_odf.to_csv( | ||
344 | + strmap_ofile, | ||
345 | + sep = "\t", | ||
346 | + header =True, | ||
347 | + index=False) | ||
348 | + | ||
349 | + #################### save all map terms #################### | ||
350 | + raw_map_odf["CASE_MATCH"] = "MCO" | ||
351 | + raw_map_odf["SET"] = 100 | ||
352 | + raw_map_odf["SORT"] = 100 | ||
353 | + | ||
354 | + full_map = concat( | ||
355 | + [all_str_matches_npl_odf, raw_map_odf], | ||
356 | + sort = True) | ||
357 | + full_map["MAP"]=True | ||
358 | + | ||
359 | + full_map.to_csv(full_map_ofile, | ||
360 | + sep = "\t", | ||
361 | + header =True, | ||
362 | + index=False) | ||
363 | + | ||
364 | + print("--------------------END----------------------") | ||
365 | + print(f"Total of terms mapped: {len(full_map.index)}\n") | ||
366 | + | ||
367 | + ###################### Merge all unmapped ###################### | ||
368 | + full_unmap = merge(npl_df, full_map[["TERM_NAME", "TERM_ID"]], on = ["TERM_NAME"], how='left') | ||
369 | + full_unmap = full_unmap[full_unmap.isna().TERM_ID] | ||
370 | + #print(full_unmap.head(3)) | ||
371 | + | ||
372 | + print(f"Total of terms unmapped: {len(full_unmap.index)}") | ||
373 | + | ||
374 | + full_unmap["SOURCE"] = "" | ||
375 | + full_unmap["CASE_MATCH"] = "" | ||
376 | + full_unmap["SET"] = 0 | ||
377 | + full_unmap["SORT"] = 0 | ||
378 | + full_unmap["MAP"]=False | ||
379 | + | ||
380 | + full_unmap.to_csv( | ||
381 | + full_unmap_ofile, | ||
382 | + sep = "\t", | ||
383 | + header =True, | ||
384 | + index=False) | ||
385 | + | ||
386 | + #################### Formatting json #################### | ||
387 | + | ||
388 | + format_fun.to_json( | ||
389 | + df = full_map, | ||
390 | + source_info = "GEO", | ||
391 | + evidence_source = "NPL-CRF", | ||
392 | + ofname = json_ofile_map | ||
393 | + ) | ||
394 | + | ||
395 | + | ||
396 | + format_fun.to_json( | ||
397 | + df = full_unmap, | ||
398 | + source_info = "GEO", | ||
399 | + evidence_source = "NPL-CRF", | ||
400 | + ofname = json_ofile_unmap | ||
401 | + ) | ||
402 | + | ||
403 | + #Merge output all | ||
404 | + full_merge = concat([full_map, full_unmap], sort=True) | ||
405 | + format_fun.to_json( | ||
406 | + df = full_merge, | ||
407 | + source_info = "GEO", | ||
408 | + evidence_source = "NPL-CRF", | ||
409 | + ofname = json_ofile_full | ||
410 | + ) | ||
411 | + | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
mapping_MCO/bin/mapping2MCO_v6.py
0 → 100755
1 | +# -*- coding: utf-8 -*- | ||
2 | +""" | ||
3 | +#Setup | ||
4 | +""" | ||
5 | + | ||
6 | +#################### Setup #################### | ||
7 | +from optparse import OptionParser | ||
8 | +import os | ||
9 | +from numpy.core.fromnumeric import sort | ||
10 | +from pandas import read_csv, DataFrame, merge, concat, read_table | ||
11 | +from numpy import exp, nan | ||
12 | +import seaborn as sns | ||
13 | +from numpy import mean | ||
14 | + | ||
15 | +import matplotlib.pyplot as plt | ||
16 | +import matplotlib | ||
17 | +matplotlib.style.use('ggplot') | ||
18 | +# %matplotlib inline | ||
19 | + | ||
20 | +from collections import Counter, defaultdict | ||
21 | +import json | ||
22 | + | ||
23 | +from fuzzywuzzy import fuzz | ||
24 | +from fuzzywuzzy import process | ||
25 | + | ||
26 | +#import format_fun | ||
27 | +import format_fun_v6 as format_fun | ||
28 | +import mapping_fun | ||
29 | +import sys | ||
30 | + | ||
31 | +""" | ||
32 | +# input parameters | ||
33 | +--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
34 | +--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv | ||
35 | +--iOntoFile gc_ontology_terms_v2.txt | ||
36 | +--iLinksFile gc_ontology_terms_link_v2.txt | ||
37 | +--iSynFile mco_terms_v0.2.json | ||
38 | +--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ | ||
39 | +--outputFile all_srr_IV_mapped.tsv | ||
40 | +--minPerMatch 90 | ||
41 | + | ||
42 | + | ||
43 | +#Example | ||
44 | +# nohup python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v5.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv --iOntoFile gc_ontology_terms_v2.txt --iSynFile mco_terms_v0.2.json --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/ --outputFile srr_htregulondb_v2.tsv --minPerMatch 80 --minCRFProbs 0.9 > /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/srr_htregulondb_mapping_report_v2.out & | ||
45 | +""" | ||
46 | +#################### Defining parameters #################### | ||
if __name__ == "__main__":
    # ==================================================================
    # Map NPL/CRF-extracted growth-condition terms to the MCO ontology.
    #
    # Pipeline:
    #   1) exact-string match against MCO term names,
    #   2) exact-string match against MCO synonyms (JSON),
    #   3) fuzzy string-similarity match against MCO,
    #   4) fuzzy string-similarity match against MCO synonyms.
    # Mapped and unmapped terms are saved as TSV and JSON outputs.
    # ==================================================================

    # ---------------- Command-line interface ----------------
    parser = OptionParser()
    parser.add_option(
        "--inputPath",
        dest="input_path",
        help="Path of npl tagged file (crf output)",
        metavar="PATH")
    parser.add_option(
        "--iAnnotatedFile",
        dest="npl_fname",
        help="Input file of npl tagged file (crf output)",
        metavar="FILE",
        default="")
    parser.add_option(
        "--iOntoFile",
        dest="onto_fname",
        help="Input file with the ontology entities",
        metavar="FILE",
        default="")
    parser.add_option(
        "--iLinksFile",
        dest="links_fname",
        help="Input file with links and id for the ontology",
        metavar="FILE",
        default=None)
    parser.add_option(
        "--iSynFile",
        dest="syn_fname",
        help="Input file for the additional ontology of synonyms",
        metavar="FILE",
        default=None)
    parser.add_option(
        "--outputPath",
        dest="output_path",
        help="Output path to place output files",
        metavar="PATH")
    parser.add_option(
        "--outputFile",
        dest="out_fname",
        help="Output file name for the mapping process",
        metavar="FILE",
        default="")
    parser.add_option(
        "--minPerMatch",
        dest="min_score",
        help="Minimal string matching percentage")
    parser.add_option(
        "--minCRFProbs",
        dest="min_probs",
        help="Minimal crf probabilities")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # BUG FIX: the old message ("Any parameter given.") was garbled and
        # the sys.exit(1) that followed was unreachable -- parser.error()
        # already prints usage and exits with status 2.
        parser.error("Unexpected positional arguments: " + " ".join(args))

    # ---------------- Display parameters ----------------
    print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
    print("--inputPath Path of npl tagged file: " + str(options.input_path))
    print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
    print("--iOntoFile Input file with the ontology entities (MCO-terms): " + str(options.onto_fname))
    print("--iLinksFile Input file with links and id for the ontology (MCO-type-links): " + str(options.links_fname))
    print("--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): " + str(options.syn_fname))
    print("--outputPath Output path to place output files: " + str(options.output_path))
    print("--outputFile Output of the mapping process: " + str(options.out_fname))
    print("--minPerMatch Minimal string matching percentage: " + str(options.min_score))
    print("--minCRFProbs Minimal crf probabilities allowed: " + str(options.min_probs))

    print("\n\n")
    # Base URL of the GitLab repository where the downloaded GEO .soft
    # files are stored; used to build a per-GSE provenance link below.
    repognrl = "http://pakal.ccg.unam.mx/cmendezc"
    reponame = "automatic-extraction-growth-conditions/tree/master/extraction-geo/download/srr_htregulondb"
    repo_url = '/'.join([repognrl, reponame])

    # Input files ========================================================================================
    min_score = int(options.min_score)    # fuzzy-match acceptance threshold (0-100)
    min_probs = float(options.min_probs)  # CRF probability filter threshold
    npl_ifile = os.path.join(options.input_path, options.npl_fname)
    mco_ifile = os.path.join(options.input_path, options.onto_fname)
    mco_syn_ifile = os.path.join(options.input_path, options.syn_fname)

    # Output files =======================================================================================

    # Per-strategy outputs (exact vs. similarity mapping).
    raw_ofname = "_".join(["raw", options.out_fname])
    rawmap_ofile = os.path.join(options.output_path, raw_ofname)
    str_ofname = "_".join(["sim", options.out_fname])
    strmap_ofile = os.path.join(options.output_path, str_ofname)

    # Combined mapped / unmapped outputs.
    full_map_ofile = os.path.join(options.output_path, "full_map_" + options.out_fname)
    full_unmap_ofile = os.path.join(options.output_path, "full_unmap_" + options.out_fname)

    # JSON outputs derived from the TSV output name.
    json_ofile = os.path.join(options.output_path, options.out_fname)
    json_ofile_map = json_ofile.replace(".tsv", "_map.json")
    json_ofile_unmap = json_ofile.replace(".tsv", "_unmap.json")
    json_ofile_full = json_ofile.replace(".tsv", "_full.json")

    # Load input data ====================================================================================

    # Columns required in the NPL-CRF extraction file.
    exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}

    # Load CRF annotation.
    npl_full = read_table(npl_ifile, sep="\t")

    # Validate the expected columns are present.
    obs_cols = set(npl_full.columns)
    if exp_cols.intersection(obs_cols) != exp_cols:
        ocol = ", ".join(list(exp_cols))
        sys.exit(ocol + " expected columns for iAnnotatedFile")

    # Keep only confident predictions, then deduplicate and drop NAs.
    npl_df = npl_full[npl_full.PROB >= min_probs]
    npl_df = npl_df.drop(columns=["PROB"])
    npl_df = npl_df.drop_duplicates(keep="first")
    npl_df = npl_df.dropna()

    # Translate NPL term types to MCO vocabulary and drop the ones
    # that have no MCO counterpart.
    npl_df['TERM_TYPE'] = [mapping_fun.transterm_npl2mco(term) for term in npl_df.TERM_TYPE]
    npl_df = npl_df[npl_df.TERM_TYPE != "exTag Type"]

    # Provenance: link each record back to the stored GEO soft file in GitLab.
    source_access = ['/'.join([repo_url, gse, gse + '.soft.gz']) for gse in npl_df['GSE']]
    npl_df['REPO_FILE'] = source_access

    # Normalize term names and PMIDs (strip spaces / "PMID_" prefix).
    npl_df['TERM_NAME'] = [txt.strip() for txt in npl_df['TERM_NAME']]
    npl_df['PMID'] = [pmid.replace("PMID_", "") for pmid in npl_df['PMID']]

    # Columns required in the MCO ontology file.
    exp_cols = {"TERM_ID", "TERM_NAME"}

    # Load MCO term names.
    mco_df_full = read_table(mco_ifile, sep="\t")

    # Validate MCO input.
    obs_cols = set(mco_df_full.columns)
    if exp_cols.intersection(obs_cols) != exp_cols:
        sys.exit("\"TERM_ID\" and \"TERM_NAME\" expected columns for iOntoFile")

    # Clean MCO input.
    mco_df = mco_df_full[["TERM_ID", "TERM_NAME"]]
    mco_df = mco_df.drop_duplicates(keep="first")
    mco_df = mco_df.dropna()

    # Load MCO term-type links (optional input).
    if options.links_fname is not None:
        print("\nLoad types...")
        mcolink_ifile = os.path.join(options.input_path, options.links_fname)
        exp_cols = {"TERM_ID", "TERM_TYPE"}
        mco_links_full = read_table(mcolink_ifile, sep="\t")

        obs_cols = set(mco_links_full.columns)

        if exp_cols.intersection(obs_cols) != exp_cols:
            sys.exit("at least \"TERM_ID\" and \"TERM_TYPE\" expected columns for iLinksFile")

        mco_links = mco_links_full[["TERM_ID", "TERM_TYPE"]]
        mco_links = mco_links.drop_duplicates(keep="first")
        mco_links = mco_links.dropna()
    else:
        mco_links = None

    # Load MCO term synonyms (JSON) and reshape into a DataFrame.
    # BUG FIX: use a context manager so the file handle is closed.
    with open(mco_syn_ifile) as mco_json:
        data = json.load(mco_json)
    mco_syn = format_fun.json2DataFrame(data)

    print('\n\n-------------------------------- INPUTS --------------------------------\n')

    print("\nnpl tagged file\n")
    print(npl_df.head(3))
    print("\nontology entities\n")
    print(mco_df.head(3))
    if options.links_fname is not None:
        print("\nlinks and id for the ontology (MCO-type-links)\n")
        print(mco_links.head(3))
    print("\nadditional ontology of synonyms (MCO-syn-json)\n")
    print(mco_syn.head(3))

    print('\n\n-------------------------------- RESULTS --------------------------------\n')

    # #################### mapping to MCO: exact string ####################

    print("\nTracking exact terms to MCO...")
    print(f"\nMapping {len(npl_df.index)} terms to MCO based on exact strings...")

    # Exact mapping to MCO term names (unmap=True keeps unmatched rows
    # with NA TERM_ID so they can be re-tried in later stages).
    raw_matches = mapping_fun.raw_map_mco(
        npl_df=npl_df,
        mco_df=mco_df,
        mco_links=mco_links,
        unmap=True)

    # Record provenance of the exact mapping and add a merge column.
    raw_matches["SOURCE"] = mco_ifile
    raw_matches["ENTITY_NAME"] = ""

    # #################### mapping to MCO synonyms: exact string ####################

    # Rows that failed the first exact pass feed the synonym pass.
    raw_mco_unmap = raw_matches[raw_matches.isna().TERM_ID]
    raw_mco_unmap = raw_mco_unmap[list(npl_df.columns)]

    print(f"\nMapping {len(raw_mco_unmap.index)} terms to MCO - synonyms based on exact strings...\n")

    # Exact mapping against the synonym table.
    raw_matches_syn = mapping_fun.raw_map_mco(
        npl_df=raw_mco_unmap,
        mco_df=mco_syn,
        unmap=True)

    raw_matches_syn["SOURCE"] = mco_syn_ifile

    # #################### save exact-string matches ####################

    # All exact matches (dropna removes the still-unmapped rows).
    raw_map_odf = concat(
        [raw_matches, raw_matches_syn],
        sort=True).dropna()

    print(f"Total of terms mapped by exact strings: {len(raw_map_odf.index)}")
    print("Saving filtered terms from raw mapping...\n\n")

    raw_map_odf.to_csv(
        rawmap_ofile,
        sep="\t",
        header=True,
        index=False)

    # #################### terms unmapped by exact strings ####################
    raw_mco_syn_unmap = raw_matches_syn[raw_matches_syn.isna().TERM_ID]
    raw_mco_syn_unmap = raw_mco_syn_unmap[list(npl_df.columns)]

    print(f"{len(raw_mco_syn_unmap.index)} unmapped terms based on exact strings")
    print("Dropping duplicated unmapped term names...")
    # Similarity matching works per unique term name; duplicates are
    # re-attached later via the merge on TERM_NAME.
    raw_mco_syn_unmap = raw_mco_syn_unmap.drop_duplicates("TERM_NAME")

    print(f"{len(raw_mco_syn_unmap.index)} unmapped unique terms based on exact strings")

    # #################### string-similarity mapping ####################
    print("\ncompute string similarty...")

    print(f"\nMapping to MCO {len(raw_mco_syn_unmap.index)} terms based on string similarity...")

    # min_match=0 so we keep every candidate score; the threshold is
    # applied below to split mapped vs. unmapped.
    str_matches = mapping_fun.str_match_map_mco(
        raw_mco_syn_unmap, mco_df,
        mco_links=mco_links,
        min_match=0,
        npl_merges=False)

    # BUG FIX: .copy() avoids mutating a view of str_matches
    # (pandas SettingWithCopyWarning / silently lost assignment).
    str_matches_odf = str_matches[str_matches.SET >= min_score].copy()
    str_matches_odf["SOURCE"] = mco_ifile

    # #################### terms unmapped by similarity (MCO) ####################
    str_mco_unmap = str_matches[str_matches.SET < min_score]
    str_mco_unmap = str_mco_unmap.drop_duplicates("TERM_NAME")

    print(f"\nMapping to MCO - synonyms {len(str_mco_unmap.index)} terms based on string siilarity..\n")
    str_matches_syn = mapping_fun.str_match_map_mco(
        str_mco_unmap, mco_syn,
        min_match=min_score,
        npl_merges=False)

    # BUG FIX: .copy() for the same view-mutation reason as above.
    str_matches_syn_odf = str_matches_syn[str_matches_syn.SET >= min_score].copy()
    str_matches_syn_odf["SOURCE"] = mco_syn_ifile

    # #################### save similarity matches ####################
    all_str_matches_odf = concat(
        [str_matches_odf, str_matches_syn_odf],
        sort=True).dropna()

    print(f"Unique terms mapped by string similarity: {len(all_str_matches_odf.index)}")

    # Re-attach the full NPL records (similarity ran on unique names only).
    all_str_matches_npl_odf = merge(
        npl_df, all_str_matches_odf,
        on=["TERM_NAME"],
        how="inner")

    print(f"Total of terms mapped by string similarity: {len(all_str_matches_npl_odf.index)}")
    print("Saving filtered terms from str mapping...\n\n")

    all_str_matches_npl_odf.to_csv(
        strmap_ofile,
        sep="\t",
        header=True,
        index=False)

    # #################### save all mapped terms ####################
    # Exact matches get perfect similarity scores by definition.
    raw_map_odf["CASE_MATCH"] = "MCO"
    raw_map_odf["SET"] = 100
    raw_map_odf["SORT"] = 100

    full_map = concat(
        [all_str_matches_npl_odf, raw_map_odf],
        sort=True)
    full_map["MAP"] = True

    full_map.to_csv(full_map_ofile,
                    sep="\t",
                    header=True,
                    index=False)

    print("--------------------END----------------------")
    print(f"Total of terms mapped: {len(full_map.index)}\n")

    # ###################### Merge all unmapped ######################
    # Left-join against the mapped names; rows with NA TERM_ID were
    # never mapped by any strategy.
    full_unmap = merge(npl_df, full_map[["TERM_NAME", "TERM_ID"]], on=["TERM_NAME"], how='left')
    # BUG FIX: .copy() before the column assignments below.
    full_unmap = full_unmap[full_unmap.isna().TERM_ID].copy()

    print(f"Total of terms unmapped: {len(full_unmap.index)}")

    full_unmap["SOURCE"] = ""
    full_unmap["CASE_MATCH"] = ""
    full_unmap["SET"] = 0
    full_unmap["SORT"] = 0
    full_unmap["MAP"] = False

    full_unmap.to_csv(
        full_unmap_ofile,
        sep="\t",
        header=True,
        index=False)

    # #################### Formatting JSON ####################

    format_fun.to_json(
        df=full_map,
        source_info="GEO",
        evidence_source="NPL-CRF",
        ofname=json_ofile_map
    )

    format_fun.to_json(
        df=full_unmap,
        source_info="GEO",
        evidence_source="NPL-CRF",
        ofname=json_ofile_unmap
    )

    # Combined output: mapped + unmapped in a single JSON.
    full_merge = concat([full_map, full_unmap], sort=True)
    format_fun.to_json(
        df=full_merge,
        source_info="GEO",
        evidence_source="NPL-CRF",
        ofname=json_ofile_full
    )
from pandas import read_csv, merge

# Filter the CRF extraction output down to the (TERM_TYPE, TERM_NAME)
# pairs that were manually validated by a curator, and save the result
# with the original column layout.

crf_output_file = "/home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/srr_htregulondb/srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv"
annot_file = "/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/correct_gc_terms_07_rev_Victor.csv"
filter_ofile = "/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/srr_htregulondb_correct_gc_terms_07_rev_Victor.tsv"

# Column layout of the CRF output file; also used to order the output.
crf_columns = ["SRR", "GSE", "GSM", "GPL", "PMID", "GSM_NAME",
               "GSE_NAME", "GPL_NAME", "BANGLINE",
               "SOURCE_TEXT_CTRL", "FULL_TEXT", "TERM_NAME",
               "TERM_TYPE", "PROB"]

# Curated terms: two columns, no header; strip stray whitespace so the
# join below matches the CRF side.
annot = read_csv(annot_file, names=["TERM_TYPE", "TERM_NAME"])
annot["TERM_NAME"] = [term.strip() for term in annot["TERM_NAME"]]

# CRF predictions (tab-separated, no header).
crf_output = read_csv(crf_output_file, names=crf_columns, sep="\t")
crf_output["TERM_NAME"] = [term.strip() for term in crf_output["TERM_NAME"]]

# Inner join keeps only curated (type, name) pairs, then restore the
# original CRF column order before writing.
filtered = merge(annot, crf_output, on=["TERM_TYPE", "TERM_NAME"])
filtered = filtered.reindex(columns=crf_columns)

filtered.to_csv(filter_ofile, sep="\t", index=False, header=True)
This diff could not be displayed because it is too large.
1 | +GC_type,GC_term | ||
2 | +Agit,250 rpm | ||
3 | +Air,Aerobic | ||
4 | +Air,Aerobic and anaerobic | ||
5 | +Air,aerobically | ||
6 | +Air,anaerobic | ||
7 | +Gtype,{ delta } baeR | ||
8 | +Gtype,{ delta } cpxR | ||
9 | +Gtype,{ delta } cspABCEG | ||
10 | +Gtype,{ delta } cspABEG | ||
11 | +Gtype,{ delta } cspBG | ||
12 | +Gtype,{ delta } hns | ||
13 | +Gtype,{ delta } kdpE | ||
14 | +Gtype,{ delta } nusG | ||
15 | +Gtype,{ delta } perC : : kanR | ||
16 | +Gtype,{ delta } phoB | ||
17 | +Gtype,{ delta } rho | ||
18 | +Gtype,{ delta } rnr | ||
19 | +Gtype,{ delta } zraR | ||
20 | +Gtype,: φO104 | ||
21 | +Gtype,DH5α ( pAR060302 ) | ||
22 | +Gtype,E.coli K12 BW25113 | ||
23 | +Gtype,K12 MG1655 | ||
24 | +Gtype,K12 MG1655 deltaprfC | ||
25 | +Gtype,K12 MG1655 prfB-Bstrain allele | ||
26 | +Gtype,K12 MG1655 prfB-Bstrain allele deltaprfC | ||
27 | +Gtype,K12 MG1657 | ||
28 | +Gtype,K12 MG1667 | ||
29 | +Gtype,K12 MG1668 | ||
30 | +Gtype,K12 MG1672 | ||
31 | +Gtype,K12 MG1673 | ||
32 | +Gtype,K12 MG1674 | ||
33 | +Gtype,K12 W3110 | ||
34 | +Gtype,MC4100 ∆ tig : : kan pTig-TEV-Avi | ||
35 | +Gtype,O157 : H7 NCTC 12900 | ||
36 | +Gtype,PNPase mutant | ||
37 | +Gtype,Pck over-expressed | ||
38 | +Gtype,RNase II mutant | ||
39 | +Gtype,RNase R mutant | ||
40 | +Gtype,W3110 6xHis-rpoD | ||
41 | +Gtype,W3110 6xHis-rpoD greA : : tet greB : : amp | ||
42 | +Gtype,"W3110 rpoC-6xHis : : kan greA : : tet , greB : : amp" | ||
43 | +Gtype,WT | ||
44 | +Gtype,WT WT | ||
45 | +Gtype,Wild type | ||
46 | +Gtype,Wild-type | ||
47 | +Gtype,Wildtype | ||
48 | +Gtype,cra KO ; BW25113 Dcra | ||
49 | +Gtype,cya mutant background | ||
50 | +Gtype,delta Crp | ||
51 | +Gtype,delta _ cra | ||
52 | +Gtype,delta-gadE | ||
53 | +Gtype,delta-gadW | ||
54 | +Gtype,delta-gadX | ||
55 | +Gtype,delta-oxyR | ||
56 | +Gtype,delta-soxR | ||
57 | +Gtype,delta-soxS | ||
58 | +Gtype,fepA knockout | ||
59 | +Gtype,fis mutant background | ||
60 | +Gtype,lacA knockout | ||
61 | +Gtype,lack Fis protein | ||
62 | +Gtype,lack H-NS protein | ||
63 | +Gtype,naive ( wild type ) | ||
64 | +Gtype,ompR deletion mutant | ||
65 | +Gtype,phageO104 in the wrbA gene | ||
66 | +Gtype,phagePA8 in the argW gene | ||
67 | +Gtype,rng mutant | ||
68 | +Gtype,sdhC knockout | ||
69 | +Gtype,sigma70 WT | ||
70 | +Gtype,wild type | ||
71 | +Gtype,wild type ; MG1655 | ||
72 | +Gtype,wild-type | ||
73 | +Gtype,wildtype | ||
74 | +Gtype,wt | ||
75 | +Gtype,yafC deletion | ||
76 | +Gtype,ybaO deletion | ||
77 | +Gtype,ybaQ deletion | ||
78 | +Gtype,ybiH deletion | ||
79 | +Gtype,ydcI deletion | ||
80 | +Gtype,yddM deletion | ||
81 | +Gtype,yeiE deletion | ||
82 | +Gtype,yheO deletion | ||
83 | +Gtype,yiaJ deletion | ||
84 | +Gtype,yieP deletion | ||
85 | +Gtype,Δcra | ||
86 | +Gtype,Δfur | ||
87 | +Gtype,ΔgadE | ||
88 | +Gtype,ΔgadW | ||
89 | +Gtype,ΔgadX | ||
90 | +Gtype,ΔoxyR | ||
91 | +Gtype,ΔsoxR | ||
92 | +Gtype,ΔsoxS | ||
93 | +Gtype,∆ cspABCEG | ||
94 | +Gtype,∆ cspABEG | ||
95 | +Gtype,∆ cspBG | ||
96 | +Gtype,∆ hfq : : cat ) | ||
97 | +Gtype,∆ rnr | ||
98 | +Med,Bertani ( LB ) medium | ||
99 | +Med,Davis Minimal medium | ||
100 | +Med,LB | ||
101 | +Med,LB media | ||
102 | +Med,LB medium | ||
103 | +Med,"LB medium ," | ||
104 | +Med,M9 + 4 g/L glc ( glucose minimal media ) | ||
105 | +Med,M9 minimal media | ||
106 | +Med,M9 minimal medium | ||
107 | +Med,MOPS complete-glucose liquid media | ||
108 | +Med,MOPS glucose minimal medium | ||
109 | +Med,MOPS medium | ||
110 | +Med,Neidhardt MOPS Minimal Medium ( NM3 ) | ||
111 | +Med,SB medium | ||
112 | +Med,SILAC | ||
113 | +Med,W2 minimal media | ||
114 | +Med,fresh DM500 | ||
115 | +Med,fully supplemented MOPS glucose media | ||
116 | +Med,glucose-M9 minimal media | ||
117 | +Med,glucose-limited minimal medium | ||
118 | +Med,in fresh LB medium | ||
119 | +Med,minimal medium | ||
120 | +OD,O.D. 600nm 0.5 | ||
121 | +OD,OD600 = 0.3 | ||
122 | +OD,OD600 of about 0.8 | ||
123 | +Phase,IspG1 strain | ||
124 | +Phase,exponential | ||
125 | +Phase,log phase | ||
126 | +Phase,log phase sample | ||
127 | +Phase,mid-log phase | ||
128 | +Phase,stationary | ||
129 | +Phase,stationary phase | ||
130 | +Supp,0.1 mM KCl | ||
131 | +Supp,0.2 % arabinose | ||
132 | +Supp,0.2 % glucose | ||
133 | +Supp,0.2 % glutamine | ||
134 | +Supp,0.2 mM of DPD | ||
135 | +Supp,0.3 % glucose | ||
136 | +Supp,0.3 M of NaCl | ||
137 | +Supp,0.4 % glucose | ||
138 | +Supp,0.5 % glucose | ||
139 | +Supp,100 μM IPTG | ||
140 | +Supp,1mM IPTG | ||
141 | +Supp,2 mM Hydrogen peroxide | ||
142 | +Supp,22 mM glucose | ||
143 | +Supp,250 uM of paraquat | ||
144 | +Supp,2g/L glucose | ||
145 | +Supp,2g/L glucose and 1 mM cytidine | ||
146 | +Supp,4g/L glucose | ||
147 | +Supp,50 µM NiCl2 | ||
148 | +Supp,70 µM IPTG | ||
149 | +Supp,DPD | ||
150 | +Supp,Fe | ||
151 | +Supp,IPTG | ||
152 | +Supp,IPTG was | ||
153 | +Supp,L-trp | ||
154 | +Supp,Xgal and IPTG | ||
155 | +Supp,acetate | ||
156 | +Supp,ade | ||
157 | +Supp,arabinose | ||
158 | +Supp,fructose | ||
159 | +Supp,glucose | ||
160 | +Supp,glutamine | ||
161 | +Supp,induced 50 µM IPTG | ||
162 | +Supp,mM IPTG | ||
163 | +Supp,mM IPTG + 50μg/ml Amp | ||
164 | +Supp,rhamnose | ||
165 | +Supp,rifampicin | ||
166 | +Supp,rifampicin and | ||
167 | +Supp,rifampicin time point | ||
168 | +Supp,rifampicin time point 0 | ||
169 | +Supp,rifampicin time point 4 | ||
170 | +Supp,rifampicin time point 6 | ||
171 | +Supp,rifampicin time point 8 | ||
172 | +Temp,10 °C | ||
173 | +Temp,30 °C | ||
174 | +Temp,37 °C | ||
175 | +Temp,37 ℃ | ||
176 | +Temp,42 °C | ||
177 | +pH,pH 5.5 | ||
178 | +pH,pH5 .5 |
mapping_MCO/input/format_zika_v3.py
0 → 100755
1 | +# -*- coding: utf-8 -*- | ||
2 | +""" | ||
3 | +#Setup | ||
4 | +""" | ||
5 | + | ||
6 | +#################### Setup #################### | ||
7 | +from collections import defaultdict | ||
8 | +from optparse import OptionParser | ||
9 | +import os | ||
10 | +from numpy.core.fromnumeric import sort | ||
11 | +from pandas import read_csv, DataFrame, merge, concat, read_table | ||
12 | +from numpy import exp, nan | ||
13 | +import seaborn as sns | ||
14 | +from numpy import mean | ||
15 | + | ||
16 | +import matplotlib.pyplot as plt | ||
17 | +import matplotlib | ||
18 | +matplotlib.style.use('ggplot') | ||
19 | +# %matplotlib inline | ||
20 | + | ||
21 | +from collections import Counter | ||
22 | +import json | ||
23 | + | ||
24 | +from fuzzywuzzy import fuzz | ||
25 | +from fuzzywuzzy import process | ||
26 | + | ||
27 | +import format_fun | ||
28 | +import mapping_fun | ||
29 | +import sys | ||
30 | + | ||
31 | +""" | ||
32 | +# input parameters | ||
33 | +--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
34 | +--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv | ||
35 | +--iOntoFile gc_ontology_terms_v2.txt | ||
36 | +--iLinksFile gc_ontology_terms_link_v2.txt | ||
37 | +--iSynFile mco_terms_v0.2.json | ||
38 | +--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ | ||
39 | +--outputFile all_srr_IV_mapped.tsv | ||
40 | +--minPerMatch 90 | ||
41 | + | ||
42 | + | ||
43 | +#Example | ||
44 | +# nohup python3 mapping2MCO_v3.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv --iOntoFile gc_ontology_terms_v2.txt --iSynFile mco_terms_v0.2.json --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ --outputFile srr_htregulondb_mapped.tsv --minPerMatch 80 --minCRFProbs 0.9 > ../reports/srr_htregulondb_mapping_report.out & | ||
45 | +""" | ||
#################### Defining parameters ####################
if __name__ == "__main__":
    # Command-line interface. Every input/output location arrives through a
    # named option; no positional arguments are accepted.
    parser = OptionParser()
    parser.add_option(
        "--inputPath",
        dest="input_path",
        help="Path of npl tagged file (crf output)",
        metavar="PATH")
    parser.add_option(
        "--iAnnotatedFile",
        dest="npl_fname",
        help="Input file of npl tagged file (crf output)",
        metavar="FILE",
        default="")
    parser.add_option(
        "--iOntoFile",
        dest="onto_fname",
        help="Input file with the ontology entities",
        metavar="FILE",
        default="")
    parser.add_option(
        "--iLinksFile",
        dest="links_fname",
        help="Input file with links and id for the ontology",
        metavar="FILE",
        default=None)
    parser.add_option(
        "--iSynFile",
        dest="syn_fname",
        help="Input file for the additional ontology of synonyms",
        metavar="FILE",
        default=None)
    parser.add_option(
        "--outputPath",
        dest="output_path",
        help="Output path to place output files",
        metavar="PATH")
    parser.add_option(
        "--outputFile",
        dest="out_fname",
        help="Output file name for the mapping process",
        metavar="FILE",
        default="")
    parser.add_option(
        "--minPerMatch",
        dest="min_score",
        help="Minimal string matching percentage")
    parser.add_option(
        "--minCRFProbs",
        dest="min_probs",
        help="Minimal crf probabilities")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # BUG FIX: the original message read "Any parameter given." and was
        # followed by an unreachable sys.exit(1) -- OptionParser.error()
        # already prints the message and exits with status 2.
        parser.error("No positional arguments expected, only options.")

    #################### DISP PARAMETERS ####################
    # Echo every option so batch/nohup runs leave a self-describing log.
    print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
    print("--inputPath Path of npl tagged file: " + str(options.input_path))
    print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
    print("--iOntoFile Input file with the ontology entities (MCO-terms): " + str(options.onto_fname))
    print("--iLinksFile Input file with links and id for the ontology (MCO-type-links): " + str(options.links_fname))
    print("--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): " + str(options.syn_fname))
    print("--outputPath Output path to place output files: " + str(options.output_path))
    print("--outputFile Output of the mapping process: " + str(options.out_fname))
    print("--minPerMatch Minimal string matching percentage: " + str(options.min_score))
    print("--minCRFProbs Minimal crf probabilities allowed: " + str(options.min_probs))

    print("\n\n")
    # Public repository location used to build REPO_FILE-style links.
    repognrl = "http://pakal.ccg.unam.mx/cmendezc"
    reponame = "automatic-extraction-growth-conditions/tree/master/extraction-geo/download/srr_htregulondb"
    repo_url = '/'.join([repognrl, reponame])

    # Input files and filter thresholds.
    min_score = int(options.min_score)    # minimal fuzzy-match percentage
    min_probs = float(options.min_probs)  # minimal CRF tagging probability
    npl_ifile = os.path.join(options.input_path, options.npl_fname)
    mco_ifile = os.path.join(options.input_path, options.onto_fname)
    mco_syn_ifile = os.path.join(options.input_path, options.syn_fname)

    # Output files: raw (exact-string) and sim (similarity) mapping tables,
    # plus full mapped/unmapped tables and JSON exports.
    raw_ofname = "_".join(["raw", options.out_fname])
    rawmap_ofile = os.path.join(options.output_path, raw_ofname)
    str_ofname = "_".join(["sim", options.out_fname])
    strmap_ofile = os.path.join(options.output_path, str_ofname)

    full_ofile = os.path.join(options.output_path, "full_" + options.out_fname)
    full_unmap_ofile = os.path.join(options.output_path, "full_unmap_" + options.out_fname)

    json_ofile = os.path.join(options.output_path, options.out_fname)
    json_ofile_map = json_ofile.replace(".tsv", "_map.json")
    json_ofile_unmap = json_ofile.replace(".tsv", "_unmap.json")

    #################### Load input data ####################
    # Load CRF-annotation (one row per tagged growth-condition term) and
    # fail fast if any expected column is absent.
    exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
    npl_full = read_table(npl_ifile, sep="\t")

    obs_cols = set(npl_full.columns)

    if exp_cols.intersection(obs_cols) != exp_cols:
        ocol = ", ".join(list(exp_cols))
        sys.exit(ocol + " expected columns for iAnnotatedFile")

    # Load MCO term names (TERM_ID -> TERM_NAME), deduplicated and NA-free.
    exp_cols = {"TERM_ID", "TERM_NAME"}
    mco_df_full = read_table(mco_ifile, sep="\t")
    obs_cols = set(mco_df_full.columns)

    if exp_cols.intersection(obs_cols) != exp_cols:
        sys.exit("\"TERM_ID\" and \"TERM_NAME\" expected columns for iOntoFile")

    mco_df = mco_df_full[["TERM_ID", "TERM_NAME"]]
    mco_df = mco_df.drop_duplicates(keep="first")
    mco_df = mco_df.dropna()

    # Load the optional MCO term-type links (TERM_ID -> TERM_TYPE).
    if options.links_fname is not None:
        print("\nLoad types...")
        mcolink_ifile = os.path.join(options.input_path, options.links_fname)
        exp_cols = {"TERM_ID", "TERM_TYPE"}
        mco_links_full = read_table(mcolink_ifile, sep="\t")

        obs_cols = set(mco_links_full.columns)

        if exp_cols.intersection(obs_cols) != exp_cols:
            sys.exit("at least \"TERM_ID\" and \"TERM_TYPE\" expected columns for iLinksFile")

        mco_links = mco_links_full[["TERM_ID", "TERM_TYPE"]]
        mco_links = mco_links.drop_duplicates(keep="first")
        mco_links = mco_links.dropna()
    else:
        mco_links = None

    # Load MCO term synonyms: reshape the MCO JSON into a DataFrame.
    mco_json = open(mco_syn_ifile)
    data = json.load(mco_json)
    mco_syn = format_fun.json2DataFrame(data)

    # Group one JSON record per tagged term, keyed by its SRR run id.
    df_json = defaultdict(list)

    # NOTE(review): `full_unmap` is referenced below but never built in this
    # script -- the mapping steps that should produce it (cf. the
    # mapping2MCO_v* pipeline) appear to be missing. TODO: restore those
    # steps or load the table from disk before this loop can run.
    for idx, row in full_unmap.iterrows():
        # BUG FIX: the original line ended with a stray "), output)" that
        # made the whole file a SyntaxError.
        record = format_fun.created_record(row)
        df_json[row.SRR].append(record)

    # BUG FIX: `json_ofile_list` and `json_ofile_df_list` were used without
    # ever being defined (NameError). Derive them from the output file name,
    # following the same pattern as json_ofile_map / json_ofile_unmap.
    json_ofile_list = json_ofile.replace(".tsv", "_list.json")
    json_ofile_df_list = json_ofile.replace(".tsv", "_df_list.json")

    # Dump the whole SRR->records dictionary as a single JSON document.
    with open(json_ofile_list, "w") as output:
        json.dump(format_fun.created_record(df_json), output)

    # Append one JSON document per SRR (original used "a" append mode).
    with open(json_ofile_df_list, "a") as output:
        for srr, records in df_json.items():
            json.dump(format_fun.created_record(records), output)
... | \ No newline at end of file | ... | \ No newline at end of file |
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
File moved
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
1 | +/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning | ||
2 | + warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning') | ||
3 | +/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v5.py:312: SettingWithCopyWarning: | ||
4 | +A value is trying to be set on a copy of a slice from a DataFrame. | ||
5 | +Try using .loc[row_indexer,col_indexer] = value instead | ||
6 | + | ||
7 | +See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy | ||
8 | + str_matches_odf["SOURCE"] = mco_ifile | ||
9 | + | ||
10 | + | ||
11 | +-------------------------------- PARAMETERS -------------------------------- | ||
12 | + | ||
13 | +--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
14 | +--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv | ||
15 | +--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt | ||
16 | +--iLinksFile Input file with links and id for the ontology (MCO-type-links): None | ||
17 | +--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json | ||
18 | +--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/ | ||
19 | +--outputFile Output of the mapping process: srr_htregulondb.tsv | ||
20 | +--minPerMatch Minimal string matching percentage: 80 | ||
21 | +--minCRFProbs Minimal crf probabilities allowed: 0.9 | ||
22 | + | ||
23 | + | ||
24 | + | ||
25 | + | ||
26 | + | ||
27 | +-------------------------------- INPUTS -------------------------------- | ||
28 | + | ||
29 | + | ||
30 | +npl tagged file | ||
31 | + | ||
32 | + SRR ... REPO_FILE | ||
33 | +0 SRR5742248 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
34 | +5 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
35 | +7 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
36 | + | ||
37 | +[3 rows x 15 columns] | ||
38 | + | ||
39 | +ontology entities | ||
40 | + | ||
41 | + TERM_ID TERM_NAME | ||
42 | +0 MCO000000014 generically dependent continuant | ||
43 | +1 MCO000000015 radiation | ||
44 | +2 MCO000000016 electromagnetic radiation | ||
45 | + | ||
46 | +additional ontology of synonyms (MCO-syn-json) | ||
47 | + | ||
48 | + ENTITY_NAME TERM_ID TERM_NAME | ||
49 | +MCO000000019 continuant MCO000000019 | ||
50 | +MCO000002475 culture medium MCO000002475 | ||
51 | +MCO000002467_0 Organism MCO000002467 biologicentity | ||
52 | + | ||
53 | + | ||
54 | +-------------------------------- RESULTS -------------------------------- | ||
55 | + | ||
56 | + | ||
57 | +Tracking exact terms to MCO... | ||
58 | + | ||
59 | +Mapping 4099 terms to MCO based on exact strings... | ||
60 | + | ||
61 | +Mapping 3770 terms to MCO - synonyms based on exact strings... | ||
62 | + | ||
63 | +Total of terms mapped by exact strings: 387 | ||
64 | +Saving filtered terms from raw mapping... | ||
65 | + | ||
66 | + | ||
67 | +3712 unmapped terms based on exact strings | ||
68 | +Dropping duplicated unmapped term names... | ||
69 | +206 unmapped unique terms based on exact strings | ||
70 | + | ||
71 | +compute string similarty... | ||
72 | + | ||
73 | +Mapping to MCO 206 terms based on string similarity... | ||
74 | + | ||
75 | +Mapping to MCO - synonyms 152 terms based on string siilarity.. | ||
76 | + | ||
77 | +Unique terms mapped by string similarity: 73 | ||
78 | +Total of terms mapped by string similarity: 1992 | ||
79 | +Saving filtered terms from str mapping... | ||
80 | + | ||
81 | + | ||
82 | +--------------------END---------------------- | ||
83 | +Total of terms mapped: 2379 | ||
84 | + | ||
85 | +Total of terms unmapped: 1720 |
1 | +/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning | ||
2 | + warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning') | ||
3 | +/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v5.py:312: SettingWithCopyWarning: | ||
4 | +A value is trying to be set on a copy of a slice from a DataFrame. | ||
5 | +Try using .loc[row_indexer,col_indexer] = value instead | ||
6 | + | ||
7 | +See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy | ||
8 | + str_matches_odf["SOURCE"] = mco_ifile | ||
9 | + | ||
10 | + | ||
11 | +-------------------------------- PARAMETERS -------------------------------- | ||
12 | + | ||
13 | +--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
14 | +--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv | ||
15 | +--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt | ||
16 | +--iLinksFile Input file with links and id for the ontology (MCO-type-links): None | ||
17 | +--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json | ||
18 | +--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/ | ||
19 | +--outputFile Output of the mapping process: srr_htregulondb.tsv | ||
20 | +--minPerMatch Minimal string matching percentage: 80 | ||
21 | +--minCRFProbs Minimal crf probabilities allowed: 0.9 | ||
22 | + | ||
23 | + | ||
24 | + | ||
25 | + | ||
26 | + | ||
27 | +-------------------------------- INPUTS -------------------------------- | ||
28 | + | ||
29 | + | ||
30 | +npl tagged file | ||
31 | + | ||
32 | + SRR ... REPO_FILE | ||
33 | +0 SRR5742248 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
34 | +5 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
35 | +7 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
36 | + | ||
37 | +[3 rows x 15 columns] | ||
38 | + | ||
39 | +ontology entities | ||
40 | + | ||
41 | + TERM_ID TERM_NAME | ||
42 | +0 MCO000000014 generically dependent continuant | ||
43 | +1 MCO000000015 radiation | ||
44 | +2 MCO000000016 electromagnetic radiation | ||
45 | + | ||
46 | +additional ontology of synonyms (MCO-syn-json) | ||
47 | + | ||
48 | + ENTITY_NAME TERM_ID TERM_NAME | ||
49 | +MCO000000019 continuant MCO000000019 | ||
50 | +MCO000002475 culture medium MCO000002475 | ||
51 | +MCO000002467_0 Organism MCO000002467 biologicentity | ||
52 | + | ||
53 | + | ||
54 | +-------------------------------- RESULTS -------------------------------- | ||
55 | + | ||
56 | + | ||
57 | +Tracking exact terms to MCO... | ||
58 | + | ||
59 | +Mapping 4099 terms to MCO based on exact strings... | ||
60 | + | ||
61 | +Mapping 3770 terms to MCO - synonyms based on exact strings... | ||
62 | + | ||
63 | +Total of terms mapped by exact strings: 387 | ||
64 | +Saving filtered terms from raw mapping... | ||
65 | + | ||
66 | + | ||
67 | +3712 unmapped terms based on exact strings | ||
68 | +Dropping duplicated unmapped term names... | ||
69 | +206 unmapped unique terms based on exact strings | ||
70 | + | ||
71 | +compute string similarty... | ||
72 | + | ||
73 | +Mapping to MCO 206 terms based on string similarity... | ||
74 | + | ||
75 | +Mapping to MCO - synonyms 152 terms based on string siilarity.. | ||
76 | + | ||
77 | +Unique terms mapped by string similarity: 73 | ||
78 | +Total of terms mapped by string similarity: 1992 | ||
79 | +Saving filtered terms from str mapping... | ||
80 | + | ||
81 | + | ||
82 | +--------------------END---------------------- | ||
83 | +Total of terms mapped: 2379 | ||
84 | + | ||
85 | +Total of terms unmapped: 1720 |
This diff could not be displayed because it is too large.
mapping_MCO/output/v3.1/zika.json
0 → 100644
This diff could not be displayed because it is too large.
mapping_MCO/output/v3.1/zika_v3.json
0 → 100644
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
File moved
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
mapping_MCO/output/v4/curated/full_map_srr_htregulondb_correct_gc_terms_07_rev_Victor_mapped.tsv
0 → 100644
This diff could not be displayed because it is too large.
mapping_MCO/output/v4/curated/full_unmap_srr_htregulondb_correct_gc_terms_07_rev_Victor_mapped.tsv
0 → 100644
This diff could not be displayed because it is too large.
mapping_MCO/output/v4/curated/raw_srr_htregulondb_correct_gc_terms_07_rev_Victor_mapped.tsv
0 → 100644
This diff could not be displayed because it is too large.
mapping_MCO/output/v4/curated/sim_srr_htregulondb_correct_gc_terms_07_rev_Victor_mapped.tsv
0 → 100644
This diff could not be displayed because it is too large.
mapping_MCO/output/v4/curated/srr_htregulondb_correct_gc_terms_07_rev_Victor_mapped_full.json
0 → 100644
This diff could not be displayed because it is too large.
mapping_MCO/output/v4/curated/srr_htregulondb_correct_gc_terms_07_rev_Victor_mapped_map.json
0 → 100644
This diff could not be displayed because it is too large.
mapping_MCO/output/v4/curated/srr_htregulondb_correct_gc_terms_07_rev_Victor_mapped_unmap.json
0 → 100644
This diff could not be displayed because it is too large.
1 | +/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning | ||
2 | + warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning') | ||
3 | +/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v6.py:313: SettingWithCopyWarning: | ||
4 | +A value is trying to be set on a copy of a slice from a DataFrame. | ||
5 | +Try using .loc[row_indexer,col_indexer] = value instead | ||
6 | + | ||
7 | +See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy | ||
8 | + str_matches_odf["SOURCE"] = mco_ifile | ||
9 | + | ||
10 | + | ||
11 | +-------------------------------- PARAMETERS -------------------------------- | ||
12 | + | ||
13 | +--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
14 | +--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_correct_gc_terms_07_rev_Victor.tsv | ||
15 | +--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt | ||
16 | +--iLinksFile Input file with links and id for the ontology (MCO-type-links): None | ||
17 | +--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json | ||
18 | +--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v4/curated/ | ||
19 | +--outputFile Output of the mapping process: srr_htregulondb_correct_gc_terms_07_rev_Victor_mapped.tsv | ||
20 | +--minPerMatch Minimal string matching percentage: 80 | ||
21 | +--minCRFProbs Minimal crf probabilities allowed: 0.9 | ||
22 | + | ||
23 | + | ||
24 | + | ||
25 | + | ||
26 | + | ||
27 | +-------------------------------- INPUTS -------------------------------- | ||
28 | + | ||
29 | + | ||
30 | +npl tagged file | ||
31 | + | ||
32 | + SRR ... REPO_FILE | ||
33 | +0 SRR771533 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
34 | +2 SRR771534 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
35 | +24 SRR3194453 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
36 | + | ||
37 | +[3 rows x 14 columns] | ||
38 | + | ||
39 | +ontology entities | ||
40 | + | ||
41 | + TERM_ID TERM_NAME | ||
42 | +0 MCO000000014 generically dependent continuant | ||
43 | +1 MCO000000015 radiation | ||
44 | +2 MCO000000016 electromagnetic radiation | ||
45 | + | ||
46 | +additional ontology of synonyms (MCO-syn-json) | ||
47 | + | ||
48 | + ENTITY_NAME TERM_ID TERM_NAME | ||
49 | +MCO000000019 continuant MCO000000019 | ||
50 | +MCO000002475 culture medium MCO000002475 | ||
51 | +MCO000002467_0 Organism MCO000002467 biologicentity | ||
52 | + | ||
53 | + | ||
54 | +-------------------------------- RESULTS -------------------------------- | ||
55 | + | ||
56 | + | ||
57 | +Tracking exact terms to MCO... | ||
58 | + | ||
59 | +Mapping 2149 terms to MCO based on exact strings... | ||
60 | + | ||
61 | +Mapping 1820 terms to MCO - synonyms based on exact strings... | ||
62 | + | ||
63 | +Total of terms mapped by exact strings: 387 | ||
64 | +Saving filtered terms from raw mapping... | ||
65 | + | ||
66 | + | ||
67 | +1762 unmapped terms based on exact strings | ||
68 | +Dropping duplicated unmapped term names... | ||
69 | +104 unmapped unique terms based on exact strings | ||
70 | + | ||
71 | +compute string similarty... | ||
72 | + | ||
73 | +Mapping to MCO 104 terms based on string similarity... | ||
74 | + | ||
75 | +Mapping to MCO - synonyms 61 terms based on string siilarity.. | ||
76 | + | ||
77 | +Unique terms mapped by string similarity: 58 | ||
78 | +Total of terms mapped by string similarity: 1570 | ||
79 | +Saving filtered terms from str mapping... | ||
80 | + | ||
81 | + | ||
82 | +--------------------END---------------------- | ||
83 | +Total of terms mapped: 1957 | ||
84 | + | ||
85 | +Total of terms unmapped: 192 |
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
1 | +/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning | ||
2 | + warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning') | ||
3 | +/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v5.py:312: SettingWithCopyWarning: | ||
4 | +A value is trying to be set on a copy of a slice from a DataFrame. | ||
5 | +Try using .loc[row_indexer,col_indexer] = value instead | ||
6 | + | ||
7 | +See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy | ||
8 | + str_matches_odf["SOURCE"] = mco_ifile | ||
9 | + | ||
10 | + | ||
11 | +-------------------------------- PARAMETERS -------------------------------- | ||
12 | + | ||
13 | +--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
14 | +--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv | ||
15 | +--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt | ||
16 | +--iLinksFile Input file with links and id for the ontology (MCO-type-links): None | ||
17 | +--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json | ||
18 | +--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v3/ | ||
19 | +--outputFile Output of the mapping process: srr_htregulondb.tsv | ||
20 | +--minPerMatch Minimal string matching percentage: 80 | ||
21 | +--minCRFProbs Minimal crf probabilities allowed: 0.9 | ||
22 | + | ||
23 | + | ||
24 | + | ||
25 | + | ||
26 | + | ||
27 | +-------------------------------- INPUTS -------------------------------- | ||
28 | + | ||
29 | + | ||
30 | +npl tagged file | ||
31 | + | ||
32 | + SRR ... REPO_FILE | ||
33 | +0 SRR5742248 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
34 | +5 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
35 | +7 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
36 | + | ||
37 | +[3 rows x 15 columns] | ||
38 | + | ||
39 | +ontology entities | ||
40 | + | ||
41 | + TERM_ID TERM_NAME | ||
42 | +0 MCO000000014 generically dependent continuant | ||
43 | +1 MCO000000015 radiation | ||
44 | +2 MCO000000016 electromagnetic radiation | ||
45 | + | ||
46 | +additional ontology of synonyms (MCO-syn-json) | ||
47 | + | ||
48 | + ENTITY_NAME TERM_ID TERM_NAME | ||
49 | +MCO000000019 continuant MCO000000019 | ||
50 | +MCO000002475 culture medium MCO000002475 | ||
51 | +MCO000002467_0 Organism MCO000002467 biologicentity | ||
52 | + | ||
53 | + | ||
54 | +-------------------------------- RESULTS -------------------------------- | ||
55 | + | ||
56 | + | ||
57 | +Tracking exact terms to MCO... | ||
58 | + | ||
59 | +Mapping 4099 terms to MCO based on exact strings... | ||
60 | + | ||
61 | +Mapping 3770 terms to MCO - synonyms based on exact strings... | ||
62 | + | ||
63 | +Total of terms mapped by exact strings: 387 | ||
64 | +Saving filtered terms from raw mapping... | ||
65 | + | ||
66 | + | ||
67 | +3712 unmapped terms based on exact strings | ||
68 | +Dropping duplicated unmapped term names... | ||
69 | +206 unmapped unique terms based on exact strings | ||
70 | + | ||
71 | +compute string similarty... | ||
72 | + | ||
73 | +Mapping to MCO 206 terms based on string similarity... | ||
74 | + | ||
75 | +Mapping to MCO - synonyms 152 terms based on string siilarity.. | ||
76 | + | ||
77 | +Unique terms mapped by string similarity: 73 | ||
78 | +Total of terms mapped by string similarity: 1992 | ||
79 | +Saving filtered terms from str mapping... | ||
80 | + | ||
81 | + | ||
82 | +--------------------END---------------------- | ||
83 | +Total of terms mapped: 2379 | ||
84 | + | ||
85 | +Total of terms unmapped: 1720 |
1 | +/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning | ||
2 | + warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning') | ||
3 | +/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v6.py:313: SettingWithCopyWarning: | ||
4 | +A value is trying to be set on a copy of a slice from a DataFrame. | ||
5 | +Try using .loc[row_indexer,col_indexer] = value instead | ||
6 | + | ||
7 | +See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy | ||
8 | + str_matches_odf["SOURCE"] = mco_ifile | ||
9 | + | ||
10 | + | ||
11 | +-------------------------------- PARAMETERS -------------------------------- | ||
12 | + | ||
13 | +--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
14 | +--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv | ||
15 | +--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt | ||
16 | +--iLinksFile Input file with links and id for the ontology (MCO-type-links): None | ||
17 | +--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json | ||
18 | +--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v4/ | ||
19 | +--outputFile Output of the mapping process: srr_htregulondb.tsv | ||
20 | +--minPerMatch Minimal string matching percentage: 80 | ||
21 | +--minCRFProbs Minimal crf probabilities allowed: 0.9 | ||
22 | + | ||
23 | + | ||
24 | + | ||
25 | + | ||
26 | + | ||
27 | +-------------------------------- INPUTS -------------------------------- | ||
28 | + | ||
29 | + | ||
30 | +npl tagged file | ||
31 | + | ||
32 | + SRR ... REPO_FILE | ||
33 | +0 SRR5742248 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
34 | +5 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
35 | +7 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
36 | + | ||
37 | +[3 rows x 14 columns] | ||
38 | + | ||
39 | +ontology entities | ||
40 | + | ||
41 | + TERM_ID TERM_NAME | ||
42 | +0 MCO000000014 generically dependent continuant | ||
43 | +1 MCO000000015 radiation | ||
44 | +2 MCO000000016 electromagnetic radiation | ||
45 | + | ||
46 | +additional ontology of synonyms (MCO-syn-json) | ||
47 | + | ||
48 | + ENTITY_NAME TERM_ID TERM_NAME | ||
49 | +MCO000000019 continuant MCO000000019 | ||
50 | +MCO000002475 culture medium MCO000002475 | ||
51 | +MCO000002467_0 Organism MCO000002467 biologicentity | ||
52 | + | ||
53 | + | ||
54 | +-------------------------------- RESULTS -------------------------------- | ||
55 | + | ||
56 | + | ||
57 | +Tracking exact terms to MCO... | ||
58 | + | ||
59 | +Mapping 3769 terms to MCO based on exact strings... | ||
60 | + | ||
61 | +Mapping 3440 terms to MCO - synonyms based on exact strings... | ||
62 | + | ||
63 | +Total of terms mapped by exact strings: 387 | ||
64 | +Saving filtered terms from raw mapping... | ||
65 | + | ||
66 | + | ||
67 | +3382 unmapped terms based on exact strings | ||
68 | +Dropping duplicated unmapped term names... | ||
69 | +206 unmapped unique terms based on exact strings | ||
70 | + | ||
71 | +compute string similarty... | ||
72 | + | ||
73 | +Mapping to MCO 206 terms based on string similarity... | ||
74 | + | ||
75 | +Mapping to MCO - synonyms 152 terms based on string siilarity.. | ||
76 | + | ||
77 | +Unique terms mapped by string similarity: 73 | ||
78 | +Total of terms mapped by string similarity: 1668 | ||
79 | +Saving filtered terms from str mapping... | ||
80 | + | ||
81 | + | ||
82 | +--------------------END---------------------- | ||
83 | +Total of terms mapped: 2055 | ||
84 | + | ||
85 | +Total of terms unmapped: 1714 |
This diff could not be displayed because it is too large.
mapping_MCO/reports/zika_mapping_report.out
0 → 100644
1 | + | ||
2 | + | ||
3 | +-------------------------------- PARAMETERS -------------------------------- | ||
4 | + | ||
5 | +--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
6 | +--iAnnotatedFile Input file of npl tagged file: No_GSM_Metadata_Selected_v4.tsv | ||
7 | +--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/ | ||
8 | +--outputFile Output of the mapping process: zika.json | ||
9 | + | ||
10 | + | ||
11 | + | ||
12 | +Total zika terms: 2351 |
-
Please register or login to post a comment