format_zika_v4.py
4.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# -*- coding: utf-8 -*-
"""
#Setup
"""
#################### Setup ####################
from collections import defaultdict
from optparse import OptionParser
import os
from pandas import read_csv, DataFrame, merge, concat, read_table
from numpy import exp, nan, mean
import json
import format_fun_v4 as format_fun
import sys
"""
# input parameters
--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv
--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/
--outputFile all_srr_IV_mapped.tsv
#Example
# python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/format_zika_v4.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile No_GSM_Metadata_Selected_v3.tsv --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ --outputFile zika_mapped.json
"""
#################### Defining parameters ####################
if __name__ == "__main__":
    # Command-line entry point. Steps:
    #   1. parse the four options (input dir/file, output dir/file),
    #   2. load the tab-separated annotated file and check its columns,
    #   3. build one record per row with format_fun.created_record and
    #      group the records by the row's SRR value,
    #   4. write {"<SRR>": {"growth_conditions": [records...]}, ...} as JSON,
    #   5. read the file back to confirm it parses as JSON.
    parser = OptionParser()
    parser.add_option(
        "--inputPath",
        dest="input_path",
        help="Path of npl tagged file (crf output)",
        metavar="PATH")
    parser.add_option(
        "--iAnnotatedFile",
        dest="npl_fname",
        help="Input file of npl tagged file (crf output)",
        metavar="FILE",
        default="")
    parser.add_option(
        "--outputPath",
        dest="output_path",
        help="Output path to place output files",
        metavar="PATH")
    parser.add_option(
        "--outputFile",
        dest="out_fname",
        help="Output file name for the mapping process",
        metavar="FILE",
        default="")
    (options, args) = parser.parse_args()
    if args:
        # Positional arguments are not accepted. parser.error() prints the
        # usage text and exits with status 2, so nothing after it would run.
        parser.error("Any parameter given.")

    # os.path.join() raises TypeError on None, and an empty file name would
    # resolve to the directory itself; report both as usage errors instead.
    missing_opts = [
        flag
        for flag, is_missing in (
            ("--inputPath", options.input_path is None),
            ("--iAnnotatedFile", not options.npl_fname),
            ("--outputPath", options.output_path is None),
            ("--outputFile", not options.out_fname),
        )
        if is_missing
    ]
    if missing_opts:
        parser.error("Missing required option(s): " + ", ".join(missing_opts))

    #################### DISP PARAMETERS ####################
    print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
    print("--inputPath Path of npl tagged file: " + str(options.input_path))
    print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
    print("--outputPath Output path to place output files: " + str(options.output_path))
    print("--outputFile Output of the mapping process: " + str(options.out_fname))
    print("\n\n")

    # Input files
    npl_ifile = os.path.join(options.input_path, options.npl_fname)
    # Output files
    ofname = os.path.join(options.output_path, options.out_fname)

    #################### Load input data ####################
    # Load CRF-annotation
    exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
    npl_full = read_table(npl_ifile, sep="\t")
    obs_cols = set(npl_full.columns)
    missing_cols = exp_cols - obs_cols
    if missing_cols:
        # sorted() keeps the message stable between runs (set order is not).
        sys.exit(
            ", ".join(sorted(exp_cols))
            + " expected columns for iAnnotatedFile; missing: "
            + ", ".join(sorted(missing_cols))
        )

    #################### Build records grouped by SRR ####################
    df_json = defaultdict(list)
    for idx, row in npl_full.iterrows():
        record = format_fun.created_record(row, source="ZIKAdb", no_map=True, esource="database")
        # Echo the records whose index label is below 2 as a quick visual
        # check (the first two rows with the default integer index).
        if idx < 2:
            print(record)
        df_json[row.SRR].append(record)

    #################### Write output ####################
    # "w" (not "a"): appending a second JSON object to an existing file would
    # leave a document that json.load() rejects.
    with open(ofname, "w", encoding="utf-8") as output:
        output.write("{")
        sep = ""
        for srr, records in df_json.items():
            output.write(sep)
            # json.dumps() quotes and escapes the key; str() keeps a
            # non-string SRR value from raising during serialization.
            output.write(json.dumps(str(srr)))
            output.write(":")
            json.dump({"growth_conditions": records}, output)
            sep = ","
        output.write("}")

    # Read the file back: json.load() raises if the output is not valid JSON.
    with open(ofname, encoding="utf-8") as written:
        df = json.load(written)