# -*- coding: utf-8 -*-
"""
#Setup
"""

#################### Setup ####################
from optparse import OptionParser
import os
import sys
import json

from pandas import read_table, merge, concat

import format_fun
import mapping_fun

"""
# Input parameters
--inputPath         /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile    srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv
--iOntoFile         gc_ontology_terms_v2.txt
--iLinksFile        gc_ontology_terms_link_v2.txt
--iSynFile          mco_terms_v0.2.json
--outputPath        /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/
--outputFile        all_srr_IV_mapped.tsv
--minPerMatch       90
--minCRFProbs       0.9

# Example
# nohup python3 mapping2MCO_v2.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv --iOntoFile gc_ontology_terms_v2.txt --iSynFile mco_terms_v0.2.json --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ --outputFile all_srr_IV_mapped_v2.tsv --minPerMatch 80 --minCRFProbs 0.9 > ../reports/srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_report_v2.out &
"""
#################### Defining parameters ####################
if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option(
        "--inputPath",
        dest="input_path",
        help="Path of the NPL-tagged file (CRF output)",
        metavar="PATH")
    parser.add_option(
        "--iAnnotatedFile",
        dest="npl_fname",
        help="File name of the NPL-tagged file (CRF output)",
        metavar="FILE",
        default="")
    parser.add_option(
        "--iOntoFile",
        dest="onto_fname",
        help="Input file with the ontology entities",
        metavar="FILE",
        default="")
    parser.add_option(
        "--iLinksFile",
        dest="links_fname",
        help="Input file with links and ids for the ontology",
        metavar="FILE",
        default=None)
    parser.add_option(
        "--iSynFile",
        dest="syn_fname",
        help="Input file for the additional ontology of synonyms",
        metavar="FILE",
        default=None)
    parser.add_option(
        "--outputPath",
        dest="output_path",
        help="Output path to place output files",
        metavar="PATH")
    parser.add_option(
        "--outputFile",
        dest="out_fname",
        help="Output file name for the mapping process",
        metavar="FILE",
        default="")
    parser.add_option(
        "--minPerMatch",
        dest="min_score",
        help="Minimal string-matching percentage")
    parser.add_option(
        "--minCRFProbs",
        dest="min_probs",
        help="Minimal CRF probability allowed")
    
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("Unexpected positional arguments given.")

    #################### DISP PARAMETERS ####################
    print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
    print("--inputPath          Path of the NPL-tagged file (CRF output): " + str(options.input_path))
    print("--iAnnotatedFile     Input file of the NPL-tagged file (CRF output): " + str(options.npl_fname))
    print("--iOntoFile          Input file with the ontology entities (MCO-terms): " + str(options.onto_fname))
    print("--iLinksFile         Input file with links and ids for the ontology (MCO-type-links): " + str(options.links_fname))
    print("--iSynFile           Input file for the additional ontology of synonyms (MCO-syn-json): " + str(options.syn_fname))
    print("--outputPath         Output path to place output files: " + str(options.output_path))
    print("--outputFile         Output of the mapping process: " + str(options.out_fname))
    print("--minPerMatch        Minimal string-matching percentage: " + str(options.min_score))
    print("--minCRFProbs        Minimal CRF probability allowed: " + str(options.min_probs))

    print("\n\n")
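    # Repository location of the per-sample source files (used below to build REPOFILE)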
    repognrl = "http://pakal.ccg.unam.mx/cmendezc"
    reponame = "automatic-extraction-growth-conditions/tree/master/extraction-geo/outputs/srr_galagan/"
    repo_url = '/'.join([repognrl,reponame])
    
    # Numeric thresholds (optparse delivers option values as strings)
    min_score = int(options.min_score)
    min_probs = float(options.min_probs)

    # Input files
    npl_ifile = os.path.join(options.input_path, options.npl_fname)
    mco_ifile = os.path.join(options.input_path, options.onto_fname)
    mco_syn_ifile = os.path.join(options.input_path, options.syn_fname)

    # Output files
    raw_ofname = "_".join(["raw", options.out_fname])
    rawmap_ofile = os.path.join(options.output_path, raw_ofname)
    str_ofname = "_".join(["sim", options.out_fname])
    strmap_ofile = os.path.join(options.output_path, str_ofname)

    #################### Load input data ####################
    # Load CRF-annotation
    exp_cols = {"GSE", "GSM", "GPL_PMID", "FULLTEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
    npl_full = read_table(npl_ifile, sep="\t")
    obs_cols = set(npl_full.columns)

    if not exp_cols.issubset(obs_cols):
        ocol = ", ".join(sorted(exp_cols))
        sys.exit("Expected columns for iAnnotatedFile: " + ocol)

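    # Keep only annotations whose CRF confidence meets --minCRFProbs,
    # then drop exact duplicates and rows with missing fields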
    npl_df = npl_full[npl_full.PROB >= min_probs]
    npl_df = npl_df.drop_duplicates(keep="first")
    npl_df = npl_df.dropna()

    # Cleaning input
    npl_df['TERM_TYPE'] = [mapping_fun.transterm_npl2mco(term) for term in npl_df.TERM_TYPE]
    # Extract the PMID from the GPL_PMID column
    npl_df['PMID'] = [r.split(":")[-1] for r in npl_df.GPL_PMID]
    # Bang-line base name (trim the trailing two characters)
    npl_df['BANGLINE'] = [n[:-2] for n in npl_df.BANGLINE]

    # Add repo-file source: URL of the stored file at GitLab,
    # e.g. <repo_url>/GSE/GSE-GSM-PMID.tsv
    url_iterm = zip(npl_df.GSE, npl_df.GSM, npl_df.GPL_PMID)
    source_access = ['/'.join([repo_url, r[0], '-'.join(r) + '.tsv']) for r in url_iterm]
    npl_df['REPOFILE'] = source_access

    # Remove additional spaces
    npl_df['TERM_NAME'] = [txt.strip() for txt in npl_df['TERM_NAME']]


    # Load MCO term names
    exp_cols = {"TERM_ID", "TERM_NAME"}
    mco_df_full = read_table(mco_ifile, sep="\t")
    obs_cols = set(mco_df_full.columns)

    if not exp_cols.issubset(obs_cols):
        sys.exit("\"TERM_ID\" and \"TERM_NAME\" expected columns for iOntoFile")
        
    mco_df = mco_df_full[["TERM_ID","TERM_NAME"]]
    mco_df = mco_df.drop_duplicates(keep="first")
    mco_df = mco_df.dropna()

    #Load MCO links
    if options.links_fname is not None:
        print("\nLoad types...")
        mcolink_ifile =  os.path.join(options.input_path, options.links_fname)
        exp_cols = {"TERM_ID", "TERM_TYPE"}
        mco_links_full = read_table(mcolink_ifile, sep="\t")
        obs_cols = set(mco_links_full.columns)

        if not exp_cols.issubset(obs_cols):
            sys.exit("at least \"TERM_ID\" and \"TERM_TYPE\" expected columns for iLinksFile")

        mco_links = mco_links_full[["TERM_ID", "TERM_TYPE"]]
        mco_links = mco_links.drop_duplicates(keep="first")
        mco_links = mco_links.dropna()
    else:
        mco_links = None

    # Load MCO term synonyms
    # Format JSON from MCO into a DataFrame
    with open(mco_syn_ifile) as mco_json:
        data = json.load(mco_json)
    mco_syn = format_fun.json2DataFrame(data)
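    # mco_syn is assumed to expose at least TERM_ID and TERM_NAME, so the same
    # mapping helpers can consume it in the synonym passes below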


    print('\n\n-------------------------------- INPUTS --------------------------------\n')


    print("\nnpl tagged file\n")
    print(npl_df.head(3))
    print("\nontology entities\n")
    print(mco_df.head(3))
    if options.links_fname is not None:
        print("\nlinks and id for the ontology (MCO-type-links)\n")
        print(mco_links.head(3))
    print("\nadditional ontology of synonyms (MCO-syn-json)\n")
    print(mco_syn.head(3))


    print('\n\n-------------------------------- RESULTS --------------------------------\n')
   
    #################### Raw mapping ####################
    n_npl = len(npl_df.index)
    print(f"\nMapping {n_npl} terms to MCO based on exact strings...\n")
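    # unmap=True is assumed to keep unmatched annotations in the result
    # (TERM_ID left as NaN) so they can be routed to the follow-up passes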
    raw_matches = mapping_fun.raw_map_mco(npl_df = npl_df, mco_df = mco_df, mco_links = mco_links, unmap = True)
    #save file name source of the raw mapping
    raw_matches["SOURCE"] = mco_ifile
    #additional column to merge
    raw_matches["ENTITY_NAME"] = ""
    
    raw_map_unmap = raw_matches[raw_matches.TERM_ID.isna()]
    n_raw_unmap = len(raw_map_unmap.index)
    print(f"\nMapping {n_raw_unmap} terms to MCO - synonyms based on exact strings...\n")
    raw_matches_syn = mapping_fun.raw_map_mco(npl_df = npl_df, mco_df = mco_syn, unmap = True)
    raw_matches_syn["SOURCE"] = mco_syn_ifile
    raw_matches_syn["TERM_TYPE"] = ""
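    # ENTITY_NAME / TERM_TYPE are filled with empty strings so both result frames
    # share the same columns; note the synonym pass re-runs over the full npl_df,
    # while raw_map_unmap above is only used to report the unmapped count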
    
    
    #################### save raw map terms ####################
    raw_map_odf = concat([raw_matches, raw_matches_syn], sort=True).dropna()
    print(raw_map_odf.head(3))
    n_raw_map = len(raw_map_odf.index)
    print(f"Total of terms mapped by exact strings: {n_raw_map}")
    print("Saving filtered terms from raw mapping...\n\n")
    raw_map_odf.to_csv(rawmap_ofile, sep="\t", header=True, index=False)

    #################### Unmapped raw terms ####################
    npl_unmap_df = concat(
        [raw_matches[raw_matches.TERM_ID.isna()],
         raw_matches_syn[raw_matches_syn.TERM_ID.isna()]],
        sort=True)

    npl_unmap_df = npl_unmap_df[list(npl_df.columns)]
    
    n_unmaped = len(npl_unmap_df.index)
    print(f"\n{n_unmaped} unmapped terms based on exact strings")
    print("Dropping duplicated unmapped term names...\n")
    npl_unmap_df = npl_unmap_df.drop_duplicates("TERM_NAME")    
    n_unmaped = len(npl_unmap_df.index)
    print(f"{n_unmaped} unmapped unique terms based on exact strings\n")

    #################### string similarity mapping ####################
    ### Matching unmapped term names
    print(f"\nMapping to MCO {n_unmaped} terms based on string similarity...\n")
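    # SET is assumed to be the string-similarity score (0-100, fuzzywuzzy-style)
    # returned by str_match_map_mco; min_match=0 scores every candidate and the
    # --minPerMatch threshold is applied afterwards via min_score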

    str_matches = mapping_fun.str_match_map_mco(npl_unmap_df, mco_df, mco_links=mco_links, min_match=0, npl_merges=False)
    # .copy() avoids pandas SettingWithCopyWarning when adding the SOURCE column
    str_matches_odf = str_matches[str_matches.SET >= min_score].copy()
    str_matches_odf["SOURCE"] = mco_ifile

    #################### Unmapped sim terms (MCO) ####################
    npl_unmap_str_df = str_matches[str_matches.SET < min_score]
    npl_unmap_str_df = npl_unmap_str_df.drop_duplicates("TERM_NAME")

    n_unmap_str = len(npl_unmap_str_df.index)
    print(f"\nMapping to MCO - synonyms {n_unmap_str} terms based on string similarity...\n")
    str_matches_syn = mapping_fun.str_match_map_mco(npl_unmap_str_df, mco_syn, min_match=min_score, npl_merges=False)
    str_matches_syn_odf = str_matches_syn.copy()
    str_matches_syn_odf["SOURCE"] = mco_syn_ifile
  
    #################### save str-sim map terms ####################    
    all_str_matches_odf = concat([str_matches_odf, str_matches_syn_odf], sort=True).dropna()
    n_str_map = len(all_str_matches_odf.index)
    print(f"Unique terms mapped by string similarity: {n_str_map}")

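    # Re-attach full annotation rows: the inner join on TERM_NAME expands each
    # unique matched name back to every GSE/GSM occurrence in npl_df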
    all_str_matches_npl_odf = merge(npl_df, all_str_matches_odf, on = ["TERM_NAME"], how="inner")    
    n_str_map = len(all_str_matches_npl_odf.index)
    print(f"Total of terms mapped by string similarity: {n_str_map}")
    print("Saving filtered terms from str mapping...")
    all_str_matches_npl_odf.to_csv(strmap_ofile, sep="\t", header=True, index=False)