# mapping2MCO_v3.py
# -*- coding: utf-8 -*-
"""
#Setup
"""

#################### Setup ####################
from optparse import OptionParser
import os
import sys
import json

from pandas import read_table, merge, concat

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

import format_fun_v4 as format_fun
import mapping_fun

"""
# input parameters
--inputPath         /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile    srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv
--iOntoFile         gc_ontology_terms_v2.txt
--iLinksFile        gc_ontology_terms_link_v2.txt  
--iSynFile          mco_terms_v0.2.json
--outputPath        /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/
--outputFile        all_srr_IV_mapped.tsv
--minPerMatch       90


#Example
# nohup python3 mapping2MCO_v3.py --inputPath      /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv --iOntoFile      gc_ontology_terms_v2.txt   --iSynFile       mco_terms_v0.2.json --outputPath     /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ --outputFile     srr_htregulondb_mapped.tsv --minPerMatch  80  --minCRFProbs 0.9 > ../reports/srr_htregulondb_mapping_report.out &
"""
#################### Defining parameters ####################
if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option(
        "--inputPath",
        dest="input_path",
        help="Path of the NPL-tagged file (CRF output)",
        metavar="PATH")
    parser.add_option(
        "--iAnnotatedFile",
        dest="npl_fname",
        help="Input NPL-tagged file (CRF output)",
        metavar="FILE",
        default="")
    parser.add_option(
        "--iOntoFile",
        dest="onto_fname",
        help="Input file with the ontology entities",
        metavar="FILE",
        default="")
    parser.add_option(
        "--iLinksFile",
        dest="links_fname",
        help="Input file with links and ids for the ontology",
        metavar="FILE",
        default=None)
    parser.add_option(
        "--iSynFile",
        dest="syn_fname",
        help="Input file with the additional ontology of synonyms",
        metavar="FILE",
        default=None)
    parser.add_option(
        "--outputPath",
        dest="output_path",
        help="Output path to place output files",
        metavar="PATH")
    parser.add_option(
        "--outputFile",
        dest="out_fname",
        help="Output file name for the mapping process",
        metavar="FILE",
        default="")
    parser.add_option(
        "--minPerMatch",
        dest="min_score",
        help="Minimal string-matching percentage (0-100)")
    parser.add_option(
        "--minCRFProbs",
        dest="min_probs",
        help="Minimal CRF probability allowed")
    
    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message to stderr and exits
        parser.error("Unexpected positional arguments: " + " ".join(args))

    #################### DISP PARAMETERS ####################
    print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
    print("--inputPath      Path of npl tagged file: " + str(options.input_path))
    print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
    print("--iOntoFile      Input file with the ontology entities (MCO-terms): " + str(options.onto_fname))
    print("--iLinksFile     Input file with links and id for the ontology (MCO-type-links): " + str(options.links_fname))
    print("--iSynFile       Input file for the additional ontology of synonyms (MCO-syn-json): " + str(options.syn_fname))
    print("--outputPath     Output path to place output files: " + str(options.output_path))
    print("--outputFile     Output of the mapping process: " + str(options.out_fname))
    print("--minPerMatch    Minimal string matching percentage: " + str(options.min_score))
    print("--minCRFProbs    Minimal crf probabilities allowed: " + str(options.min_probs))

    print("\n\n")
    repognrl = "http://pakal.ccg.unam.mx/cmendezc"
    reponame = "automatic-extraction-growth-conditions/tree/master/extraction-geo/download/srr_htregulondb"
    repo_url = '/'.join([repognrl,reponame])
    
    # Input files
    min_score = int(options.min_score)
    min_probs = float(options.min_probs)
    npl_ifile =  os.path.join(options.input_path, options.npl_fname)
    mco_ifile =  os.path.join(options.input_path, options.onto_fname)
    mco_syn_ifile =  os.path.join(options.input_path, options.syn_fname)

    #Output files
    raw_ofname = "_".join(["raw", options.out_fname])
    rawmap_ofile =  os.path.join(options.output_path, raw_ofname)
    str_ofname = "_".join(["sim", options.out_fname])
    strmap_ofile =  os.path.join(options.output_path, str_ofname)

    full_ofile = os.path.join(options.output_path, "full_"+options.out_fname)
    full_unmap_ofile = os.path.join(options.output_path, "full_unmap_"+options.out_fname)

    json_ofile = os.path.join(options.output_path, options.out_fname)
    json_ofile_map  = json_ofile.replace(".tsv", "_map.json")
    json_ofile_unmap= json_ofile.replace(".tsv", "_unmap.json")
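    # e.g. with --outputFile all_srr_IV_mapped.tsv the derived outputs are
    # raw_all_srr_IV_mapped.tsv, sim_all_srr_IV_mapped.tsv,
    # full_all_srr_IV_mapped.tsv, full_unmap_all_srr_IV_mapped.tsv,
    # all_srr_IV_mapped_map.json and all_srr_IV_mapped_unmap.json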
    
    #################### Load input data ####################
    # Load CRF-annotation
    exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
    npl_full = read_table(npl_ifile,  sep = "\t")
    
    obs_cols = set(npl_full.columns)

    if not exp_cols.issubset(obs_cols):
        missing = ", ".join(sorted(exp_cols - obs_cols))
        sys.exit("iAnnotatedFile is missing expected columns: " + missing)
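    # The same expected-columns check recurs below for iOntoFile and
    # iLinksFile; a hypothetical helper like check_columns() could replace
    # the three inline copies (left unused here so the original flow is
    # unchanged).
    def check_columns(df, expected, fname):
        missing = expected - set(df.columns)
        if missing:
            sys.exit(fname + " is missing expected columns: " + ", ".join(sorted(missing)))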

    npl_df = npl_full[npl_full.PROB >= min_probs]
    npl_df = npl_df.drop_duplicates(keep="first")
    npl_df = npl_df.dropna()
    

    # Clean input: normalize NPL term types to MCO term types
    npl_df['TERM_TYPE'] = [mapping_fun.transterm_npl2mco(term) for term in npl_df.TERM_TYPE]
    # filter out non-MCO term types
    npl_df = npl_df[npl_df.TERM_TYPE != "exTag Type"]
     

    # add REPO_FILE source: access to the stored .soft files on GitLab
    source_access = ['/'.join([repo_url, gse, gse + '.soft.gz']) for gse in npl_df['GSE']]
    npl_df['REPO_FILE'] = source_access
    
    # remove extra whitespace and the PMID_ prefix
    npl_df['TERM_NAME'] = [txt.strip() for txt in npl_df['TERM_NAME']]
    npl_df['PMID'] = [pmid.replace("PMID_", "") for pmid in npl_df['PMID']]


    #Load MCO term names
    exp_cols = {"TERM_ID", "TERM_NAME"}
    mco_df_full = read_table(mco_ifile,  sep = "\t")
    obs_cols = set(mco_df_full.columns)

    if not exp_cols.issubset(obs_cols):
        sys.exit("\"TERM_ID\" and \"TERM_NAME\" expected columns for iOntoFile")
        
    mco_df = mco_df_full[["TERM_ID","TERM_NAME"]]
    mco_df = mco_df.drop_duplicates(keep="first")
    mco_df = mco_df.dropna()

    #Load MCO links
    if options.links_fname is not None:
        print("\nLoad types...")
        mcolink_ifile =  os.path.join(options.input_path, options.links_fname)
        exp_cols = {"TERM_ID", "TERM_TYPE"}
        mco_links_full = read_table(mcolink_ifile, sep = "\t")

        obs_cols = set(mco_links_full.columns)

        if not exp_cols.issubset(obs_cols):
            sys.exit("at least \"TERM_ID\" and \"TERM_TYPE\" expected columns for iLinksFile")

        mco_links = mco_links_full[["TERM_ID", "TERM_TYPE"]]
        mco_links = mco_links.drop_duplicates(keep="first")
        mco_links = mco_links.dropna()
    else:
        mco_links = None

    # Load MCO term synonyms
    # format the MCO json into a DataFrame
    with open(mco_syn_ifile) as mco_json:
        data = json.load(mco_json)
    mco_syn = format_fun.json2DataFrame(data)
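    # format_fun.json2DataFrame is assumed to return a DataFrame with at
    # least TERM_ID and TERM_NAME columns, since mco_syn is passed to the
    # same mapping routines as mco_df and matched on TERM_NAME.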


    print('\n\n-------------------------------- INPUTS --------------------------------\n')


    print("\nnpl tagged file\n")
    print(npl_df.head(3))
    print("\nontology entities\n")
    print(mco_df.head(3))
    if options.links_fname is not None:
        print("\nlinks and id for the ontology (MCO-type-links)\n")
        print(mco_links.head(3))
    print("\nadditional ontology of synonyms (MCO-syn-json)\n")
    print(mco_syn.head(3))


    print('\n\n-------------------------------- RESULTS --------------------------------\n')
   
    #################### mapping to MCO, exact strings ####################
        
    print(f"\nMapping {len(npl_df.index)} terms to MCO based on exact strings...\n")
    
    #first mapping
    raw_matches = mapping_fun.raw_map_mco(
        npl_df = npl_df, 
        mco_df = mco_df, 
        mco_links = mco_links, 
        unmap = True)
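    # A minimal sketch of what mapping_fun.raw_map_mco is assumed to do:
    # exact-string mapping amounts to a left merge on TERM_NAME, with
    # unmapped rows keeping NaN in TERM_ID (the real function also handles
    # mco_links and the unmap flag). raw_map_sketch is hypothetical and
    # illustrative only; it is not called below.
    def raw_map_sketch(npl, onto):
        return merge(npl, onto[["TERM_ID", "TERM_NAME"]], on="TERM_NAME", how="left")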

    # record the source file of the raw mapping
    raw_matches["SOURCE"] = mco_ifile
    # additional column needed for the merge
    raw_matches["ENTITY_NAME"] = ""
    
    #################### mapping to MCO.syn, exact strings ####################

    # define the unmapped terms
    raw_mco_unmap = raw_matches[raw_matches.TERM_ID.isna()]
    # input for the second step
    raw_mco_unmap = raw_mco_unmap[list(npl_df.columns)]

    print(f"\nMapping {len(raw_mco_unmap.index)} terms to MCO - synonyms based on exact strings...\n")
    
    #second mapping
    raw_matches_syn = mapping_fun.raw_map_mco(
        npl_df = raw_mco_unmap, 
        mco_df = mco_syn, 
        unmap = True)
    
    #additional column to merge
    raw_matches_syn["SOURCE"] = mco_syn_ifile
    #raw_matches_syn["TERM_TYPE"] = ""
    
    #################### save mapped terms based on exact strings ####################
    
    #all mapped
    raw_map_odf = concat([raw_matches, raw_matches_syn], sort=True).dropna()
    
    print(raw_map_odf.head(3))    
    print(f"Total of terms mapped by exact strings: {len(raw_map_odf.index)}")
    print("Saving filtered terms from raw mapping...\n\n")
    
    raw_map_odf.to_csv(rawmap_ofile, sep = "\t", header =True, index=False)

    #################### unmapped raw terms ####################
    raw_mco_syn_unmap = raw_matches_syn[raw_matches_syn.TERM_ID.isna()]
    raw_mco_syn_unmap = raw_mco_syn_unmap[list(npl_df.columns)]
        
    print(f"\n{len(raw_mco_syn_unmap.index)} unmapped terms based on exact strings")
    print("Dropping duplicated unmapped term names...\n")
    raw_mco_syn_unmap = raw_mco_syn_unmap.drop_duplicates("TERM_NAME")        
    
    print(f"{len(raw_mco_syn_unmap.index)} unmapped unique terms based on exact strings\n")

    #################### string similarity mapping ####################
    # matching unmapped term names
    print(f"\nMapping to MCO {len(raw_mco_syn_unmap.index)} terms based on string similarity...\n")

    str_matches = mapping_fun.str_match_map_mco(raw_mco_syn_unmap, mco_df, mco_links = mco_links,  min_match=0, npl_merges=False)
    str_matches_odf = str_matches[str_matches.SET >= min_score]
    str_matches_odf["SOURCE"] = mco_ifile    

    #################### unmapped sim terms (MCO) ####################
    str_mco_unmap = str_matches[str_matches.SET < min_score]
    #str_mco_unmap = str_mco_unmap[list(npl_df.columns)]
    str_mco_unmap = str_mco_unmap.drop_duplicates("TERM_NAME")

    print(f"\nMapping to MCO - synonyms {len(str_mco_unmap.index)} terms based on string siilarity..\n")
    str_matches_syn = mapping_fun.str_match_map_mco(str_mco_unmap, mco_syn, min_match=min_score, npl_merges=False)
    str_matches_syn_odf = str_matches_syn[str_matches_syn.SET >= min_score]
    str_matches_syn_odf["SOURCE"] = mco_syn_ifile
  
    #################### save str-sim map terms ####################    
    all_str_matches_odf =  concat([str_matches_odf, str_matches_syn_odf], sort = True).dropna()     
        
    print(f"Unique terms mapped by string similarity: {len(all_str_matches_odf.index)}")

    all_str_matches_npl_odf = merge(npl_df, all_str_matches_odf, on = ["TERM_NAME"], how="inner")    
    
    print(f"Total of terms mapped by string similarity: {len(all_str_matches_npl_odf.index)}")
    print("Saving filtered terms from str mapping...")
    
    all_str_matches_npl_odf.to_csv(strmap_ofile, sep = "\t", header =True, index=False)

    #################### Formatting json ####################
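    # exact-string matches get perfect similarity scores (SET/SORT = 100) so
    # they concatenate cleanly with the string-similarity results below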
    raw_map_odf["CASE_MATCH"] = "MCO"
    raw_map_odf["SET"] = 100
    raw_map_odf["SORT"] = 100

    full_map = concat([all_str_matches_npl_odf, raw_map_odf], sort = True)
    
    full_map.to_csv(full_ofile, sep = "\t", header =True, index=False)

    print(f"Total of terms mapped: {len(full_map.index)}")
    
    with open(json_ofile_map, "a") as output:
        for idx, row in full_map.iterrows():
            json.dump(format_fun.created_record(row), output)
            output.write("\n")  # one JSON record per line (JSON Lines)
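    # With one record per line, the file can be read back as JSON Lines, e.g.:
    #   with open(json_ofile_map) as fh:
    #       records = [json.loads(line) for line in fh]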

    
    full_unmap = merge(npl_df, full_map[["TERM_NAME", "TERM_ID"]], on = ["TERM_NAME"], how='left')
    full_unmap = full_unmap[full_unmap.isna().TERM_ID]
    print(full_unmap.head(3))
    
    print(f"Total of terms unmapped: {len(full_unmap.index)}")
    
    full_unmap["SOURCE"] = ""
    full_unmap["CASE_MATCH"] = ""
    full_unmap["SET"] = 0
    full_unmap["SORT"] = 0

    full_unmap.to_csv(full_unmap_ofile, sep = "\t", header =True, index=False)

    with open(json_ofile_unmap, "a") as output:
        for idx, row in full_unmap.iterrows():
            json.dump(format_fun.created_record(row), output)
            output.write("\n")  # one JSON record per line (JSON Lines)