# -*- coding: utf-8 -*-
"""
#Setup
"""

#################### Setup ####################
from collections import defaultdict
from optparse import OptionParser
import os
import sys
import json
from pandas import read_table
import format_fun_v4 as format_fun

"""
# input parameters
--inputPath         /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile    srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv
--outputPath        /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/
--outputFile        all_srr_IV_mapped.tsv


#Example
# python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/format_zika_v4.py --inputPath  /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile No_GSM_Metadata_Selected_v3.tsv --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ --outputFile  zika_mapped.json
"""
#################### Defining parameters ####################
if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option(
        "--inputPath",
        dest="input_path",
        help="Path of the NLP-tagged file (CRF output)",
        metavar="PATH")
    parser.add_option(
        "--iAnnotatedFile",
        dest="npl_fname",
        help="Name of the NLP-tagged input file (CRF output)",
        metavar="FILE",
        default="")
    parser.add_option(
        "--outputPath",
        dest="output_path",
        help="Output path to place output files",
        metavar="PATH")
    parser.add_option(
        "--outputFile",
        dest="out_fname",
        help="Output file name for the mapping process",
        metavar="FILE",
        default="")
    
    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and exits with a non-zero status
        parser.error("Unexpected positional arguments; only the named options above are accepted.")

    #################### DISP PARAMETERS ####################
    print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
    print("--inputPath      Path of the NLP-tagged file (CRF output): " + str(options.input_path))
    print("--iAnnotatedFile Name of the NLP-tagged file (CRF output): " + str(options.npl_fname))
    print("--outputPath     Output path to place output files: " + str(options.output_path))
    print("--outputFile     Output file of the mapping process: " + str(options.out_fname))

    print("\n\n")
    
    # Input file
    npl_ifile = os.path.join(options.input_path, options.npl_fname)

    # Output file
    ofname = os.path.join(options.output_path, options.out_fname)
    
    #################### Load input data ####################
    # Load the CRF annotations (tab-separated)
    exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
    npl_full = read_table(npl_ifile, sep="\t")

    # Abort if any expected column is missing from the annotated file
    obs_cols = set(npl_full.columns)
    if not exp_cols.issubset(obs_cols):
        sys.exit("iAnnotatedFile is expected to contain the columns: " + ", ".join(sorted(exp_cols)))
    
    # Group one record per annotated row under its SRR accession
    df_json = defaultdict(list)

    for idx, row in npl_full.iterrows():
        record = format_fun.created_record(row, source="ZIKAdb", no_map=True, esource="database")
        # Print the first two records as a quick sanity check
        if idx < 2:
            print(record)
        df_json[row.SRR].append(record)
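    # At this point df_json maps each SRR accession to its list of record dicts,
    # roughly like this (illustrative key; the record fields come from format_fun.created_record):
    #   {"SRR123456": [{...}, {...}], "SRR234567": [{...}], ...}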

    """
    with open(ofname, "a") as output:
        output.write("field:[")
        sep=""
        for k,v in df_json.items():            
            output.write(sep)
            json.dump(v, output)            
            sep=","
        output.write("]")

    """
    with open(ofname, "a") as output:
        output.write("{")
        sep=""
        for k,v in df_json.items():            
            output.write(sep)
            output.write("\""+k+"\"")
            output.write(":")
            record_list = {                
                "growth_conditions": df_json[k]
            }
            json.dump(record_list, output)            
            sep=","
        output.write("}")
    
    # Sanity check: reload the generated file to confirm it is valid JSON
    with open(ofname) as handle:
        df = json.load(handle)