# format_zika_v5.py
# -*- coding: utf-8 -*-
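"""
Format ZIKAdb CRF-annotated terms.

Reads the tab-separated annotation file located at
--inputPath/--iAnnotatedFile, drops duplicated rows, checks that the
expected columns are present, adds a MAP column set to False and passes the
table to format_fun.to_json together with the --outputPath/--outputFile
target.
"""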

#################### Setup ####################
from optparse import OptionParser
import os
from pandas import read_csv, DataFrame, merge, concat, read_table
from numpy import mean
import format_fun_v6 as format_fun
import sys

"""
# input parameters
--inputPath         /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile    srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv
--outputPath        /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/
--outputFile        all_srr_IV_mapped.tsv


#Example
# python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/format_zika_v5.py --inputPath  /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile No_GSM_Metadata_Selected_v4.tsv --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/ --outputFile  zika.json > automatic-extraction-growth-conditions/mapping_MCO/reports/zika_formated_report.out
# python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/format_zika_v5.py --inputPath  /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile No_GSM_Metadata_Selected_v4.tsv --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/test/ --outputFile  zika_mapped.json > automatic-extraction-growth-conditions/mapping_MCO/test/zika_mapping_report.out

"""
#################### Defining parameters ####################
# Columns that the CRF-annotated input table must provide.
EXPECTED_COLUMNS = (
    "SRR", "GSE", "GSM", "GPL", "PMID",
    "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB",
)


def build_parser():
    """Return the OptionParser describing this script's command line."""
    parser = OptionParser()
    parser.add_option(
        "--inputPath",
        dest="input_path",
        help="Path of npl tagged file (crf output)",
        metavar="PATH")
    parser.add_option(
        "--iAnnotatedFile",
        dest="npl_fname",
        help="Input file of npl tagged file (crf output)",
        metavar="FILE",
        default="")
    parser.add_option(
        "--outputPath",
        dest="output_path",
        help="Output path to place output files",
        metavar="PATH")
    parser.add_option(
        "--outputFile",
        dest="out_fname",
        help="Output file name for the mapping process",
        metavar="FILE",
        default="")
    return parser


def parse_options(argv=None):
    """Parse and validate the command line.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The optparse ``Values`` object holding ``input_path``, ``npl_fname``,
        ``output_path`` and ``out_fname``.

    Raises:
        SystemExit: With status 2 (via ``OptionParser.error``) when stray
            positional arguments are given or a required option is missing.
    """
    parser = build_parser()
    options, args = parser.parse_args(argv)

    # parser.error() prints the usage message and exits with status 2 itself,
    # so no further sys.exit() is needed after it.
    if args:
        parser.error("Unexpected positional arguments: " + " ".join(args))

    # Without these, os.path.join()/read_table() would fail later with an
    # unhelpful traceback (None path, or a directory opened as a file).
    required = (
        ("--inputPath", options.input_path),
        ("--iAnnotatedFile", options.npl_fname),
        ("--outputPath", options.output_path),
    )
    absent = [flag for flag, value in required if not value]
    if absent:
        parser.error("Missing required option(s): " + ", ".join(absent))

    return options


def missing_columns(observed, expected=EXPECTED_COLUMNS):
    """Return the names in *expected* that are absent from *observed*, sorted."""
    return sorted(set(expected) - set(observed))


def main(argv=None):
    """Load the annotated table, validate it and pass it to format_fun.to_json.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Raises:
        SystemExit: On invalid command-line usage (status 2) or when the
            input table lacks one of the expected columns (status 1).
    """
    options = parse_options(argv)

    #################### DISP PARAMETERS ####################
    print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
    print("--inputPath      Path of npl tagged file: " + str(options.input_path))
    print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
    print("--outputPath     Output path to place output files: " + str(options.output_path))
    print("--outputFile     Output of the mapping process: " + str(options.out_fname))

    print("\n\n")

    # Input files
    npl_ifile = os.path.join(options.input_path, options.npl_fname)

    # Output files
    ofname = os.path.join(options.output_path, options.out_fname)

    #################### Load input data ####################
    # Load CRF-annotation and discard fully duplicated rows.
    npl_full = read_table(npl_ifile, sep="\t")
    npl_full = npl_full.drop_duplicates()

    print(f"Total zika terms: {len(npl_full)} ")

    absent = missing_columns(npl_full.columns)
    if absent:
        # List the expected columns in a fixed order and name the ones that
        # are actually missing so the input file can be fixed directly.
        sys.exit(
            ", ".join(EXPECTED_COLUMNS)
            + " expected columns for iAnnotatedFile"
            + " (missing: " + ", ".join(absent) + ")")

    #################### Export ####################
    # Every row gets MAP = False before the table is handed over.
    npl_full["MAP"] = False
    format_fun.to_json(
        df=npl_full,
        source_info="ZIKAdb",
        evidence_source="database",
        ofname=ofname)


if __name__ == "__main__":
    main()