# format_zika_v3.py 7.78 KB
#
# Raw Blame History Permalink

# -*- coding: utf-8 -*-
"""
#Setup
"""

#################### Setup ####################
from collections import defaultdict
from optparse import OptionParser
import os
from numpy.core.fromnumeric import sort
from pandas import read_csv, DataFrame, merge, concat, read_table
from numpy import exp, nan
import seaborn as sns
from numpy import mean

import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
# %matplotlib inline

from collections import Counter
import json

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

import format_fun
import mapping_fun
import sys

"""
# input parameters
--inputPath         /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile    srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv
--iOntoFile         gc_ontology_terms_v2.txt
--iLinksFile        gc_ontology_terms_link_v2.txt
--iSynFile          mco_terms_v0.2.json
--outputPath        /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/
--outputFile        all_srr_IV_mapped.tsv
--minPerMatch       90
--minCRFProbs       0.9


#Example
# nohup python3 mapping2MCO_v3.py --inputPath      /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv --iOntoFile      gc_ontology_terms_v2.txt   --iSynFile       mco_terms_v0.2.json --outputPath     /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ --outputFile     srr_htregulondb_mapped.tsv --minPerMatch  80  --minCRFProbs 0.9 > ../reports/srr_htregulondb_mapping_report.out &
"""
#################### Defining parameters ####################
def missing_columns(frame, expected):
    """Return the sorted names in `expected` that are absent from `frame.columns`.

    An empty list means every expected column is present.
    """
    return sorted(set(expected) - set(frame.columns))


if __name__ == "__main__":
    # Command-line driver: parse the options, echo them, load the CRF
    # annotations and the ontology tables, then dump per-SRR records as JSON.
    parser = OptionParser()
    parser.add_option(
        "--inputPath",
        dest="input_path",
        help="Path of npl tagged file (crf output)",
        metavar="PATH")
    parser.add_option(
        "--iAnnotatedFile",
        dest="npl_fname",
        help="Input file of npl tagged file (crf output)",
        metavar="FILE",
        default="")
    parser.add_option(
        "--iOntoFile",
        dest="onto_fname",
        help="Input file with the ontology entities",
        metavar="FILE",
        default="")
    parser.add_option(
        "--iLinksFile",
        dest="links_fname",
        help="Input file with links and id for the ontology",
        metavar="FILE",
        default=None)
    parser.add_option(
        "--iSynFile",
        dest="syn_fname",
        help="Input file for the additional ontology of synonyms",
        metavar="FILE",
        default=None)
    parser.add_option(
        "--outputPath",
        dest="output_path",
        help="Output path to place output files",
        metavar="PATH")
    parser.add_option(
        "--outputFile",
        dest="out_fname",
        help="Output file name for the mapping process",
        metavar="FILE",
        default="")
    parser.add_option(
        "--minPerMatch",
        dest="min_score",
        help="Minimal string matching percentage")
    parser.add_option(
        "--minCRFProbs",
        dest="min_probs",
        help="Minimal crf probabilities")

    (options, args) = parser.parse_args()
    if args:
        # parser.error() prints the usage message and exits on its own.
        parser.error("No positional arguments expected, got: " + " ".join(args))

    # These options have no default; without them the path joins and numeric
    # conversions below would fail with an opaque TypeError.
    required = (
        ("--inputPath", options.input_path),
        ("--outputPath", options.output_path),
        ("--minPerMatch", options.min_score),
        ("--minCRFProbs", options.min_probs),
    )
    absent = [flag for flag, value in required if value is None]
    if absent:
        parser.error("Missing required option(s): " + ", ".join(absent))

    #################### DISP PARAMETERS ####################
    print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
    print("--inputPath      Path of npl tagged file: " + str(options.input_path))
    print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
    print("--iOntoFile      Input file with the ontology entities (MCO-terms): " + str(options.onto_fname))
    print("--iLinksFile     Input file with links and id for the ontology (MCO-type-links): " + str(options.links_fname))
    print("--iSynFile       Input file for the additional ontology of synonyms (MCO-syn-json): " + str(options.syn_fname))
    print("--outputPath     Output path to place output files: " + str(options.output_path))
    print("--outputFile     Output of the mapping process: " + str(options.out_fname))
    print("--minPerMatch    Minimal string matching percentage: " + str(options.min_score))
    print("--minCRFProbs    Minimal crf probabilities allowed: " + str(options.min_probs))

    print("\n\n")
    repognrl = "http://pakal.ccg.unam.mx/cmendezc"
    reponame = "automatic-extraction-growth-conditions/tree/master/extraction-geo/download/srr_htregulondb"
    repo_url = '/'.join([repognrl, reponame])

    # Numeric thresholds
    try:
        min_score = int(options.min_score)
        min_probs = float(options.min_probs)
    except ValueError:
        parser.error("--minPerMatch must be an integer and --minCRFProbs a number")

    # Input files
    npl_ifile = os.path.join(options.input_path, options.npl_fname)
    mco_ifile = os.path.join(options.input_path, options.onto_fname)

    # Output files
    raw_ofname = "_".join(["raw", options.out_fname])
    rawmap_ofile = os.path.join(options.output_path, raw_ofname)
    str_ofname = "_".join(["sim", options.out_fname])
    strmap_ofile = os.path.join(options.output_path, str_ofname)

    full_ofile = os.path.join(options.output_path, "full_" + options.out_fname)
    full_unmap_ofile = os.path.join(options.output_path, "full_unmap_" + options.out_fname)

    json_ofile = os.path.join(options.output_path, options.out_fname)
    json_ofile_map = json_ofile.replace(".tsv", "_map.json")
    json_ofile_unmap = json_ofile.replace(".tsv", "_unmap.json")
    # NOTE(review): the two names below were used but never defined in the
    # original; they are derived here following the _map/_unmap pattern above.
    # Confirm the intended file names.
    json_ofile_list = json_ofile.replace(".tsv", "_list.json")
    json_ofile_df_list = json_ofile.replace(".tsv", "_df_list.json")

    #################### Load input data ####################
    # Load CRF-annotation
    npl_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
    npl_full = read_table(npl_ifile, sep="\t")

    if missing_columns(npl_full, npl_cols):
        # Sorted so the message is the same on every run.
        sys.exit(", ".join(sorted(npl_cols)) + " expected columns for iAnnotatedFile")

    # Load MCO term names
    mco_df_full = read_table(mco_ifile, sep="\t")

    if missing_columns(mco_df_full, {"TERM_ID", "TERM_NAME"}):
        sys.exit("\"TERM_ID\" and \"TERM_NAME\" expected columns for iOntoFile")

    mco_df = mco_df_full[["TERM_ID", "TERM_NAME"]]
    mco_df = mco_df.drop_duplicates(keep="first")
    mco_df = mco_df.dropna()

    # Load MCO links (optional)
    if options.links_fname is not None:
        print("\nLoad types...")
        mcolink_ifile = os.path.join(options.input_path, options.links_fname)
        mco_links_full = read_table(mcolink_ifile, sep="\t")

        if missing_columns(mco_links_full, {"TERM_ID", "TERM_TYPE"}):
            sys.exit("at least \"TERM_ID\" and \"TERM_TYPE\" expected columns for iLinksFile")

        mco_links = mco_links_full[["TERM_ID", "TERM_TYPE"]]
        mco_links = mco_links.drop_duplicates(keep="first")
        mco_links = mco_links.dropna()
    else:
        mco_links = None

    # Load MCO terms synonyms (optional: --iSynFile defaults to None)
    # format json from mco to dataframe
    if options.syn_fname is not None:
        mco_syn_ifile = os.path.join(options.input_path, options.syn_fname)
        with open(mco_syn_ifile, encoding="utf-8") as mco_json:
            data = json.load(mco_json)
        mco_syn = format_fun.json2DataFrame(data)
    else:
        mco_syn = None

    #################### Write JSON output ####################
    # NOTE(review): the original iterated over `full_unmap`, which is never
    # defined in this script. The CRF annotations loaded above (validated to
    # contain an SRR column) are used instead -- confirm this is the intended
    # frame.
    full_unmap = npl_full

    # Group one record per annotation row under its SRR identifier.
    df_json = defaultdict(list)
    for _, row in full_unmap.iterrows():
        record = format_fun.created_record(row)
        df_json[row.SRR].append(record)

    # NOTE(review): created_record() is applied to the whole grouping here and
    # to each per-SRR list below, exactly as in the original -- verify against
    # format_fun that it accepts these inputs.
    with open(json_ofile_list, "w") as output:
        json.dump(format_fun.created_record(df_json), output)

    # Append mode, as in the original: repeated runs keep adding to this file.
    with open(json_ofile_df_list, "a") as output:
        for records in df_json.values():
            json.dump(format_fun.created_record(records), output)